diff --git a/.config/1espt/PipelineAutobaseliningConfig.yml b/.config/1espt/PipelineAutobaseliningConfig.yml
index daa9b73d5971a..183d52d5c1d44 100644
--- a/.config/1espt/PipelineAutobaseliningConfig.yml
+++ b/.config/1espt/PipelineAutobaseliningConfig.yml
@@ -5,15 +5,16 @@ pipelines:
retail:
source:
credscan:
- lastModifiedDate: 2024-10-24
+ lastModifiedDate: 2024-10-25
policheck:
- lastModifiedDate: 2024-10-24
+ lastModifiedDate: 2024-10-25
eslint:
- lastModifiedDate: 2024-10-24
+ lastModifiedDate: 2024-10-25
psscriptanalyzer:
- lastModifiedDate: 2024-10-24
+ lastModifiedDate: 2024-10-25
armory:
- lastModifiedDate: 2024-10-24
+ lastModifiedDate: 2024-10-25
+ usedNonDefaultBranch: true
1299:
retail:
source:
@@ -25,6 +26,8 @@ pipelines:
lastModifiedDate: 2024-10-25
armory:
lastModifiedDate: 2024-10-25
+ policheck:
+ lastModifiedDate: 2024-10-29
binary:
credscan:
lastModifiedDate: 2024-10-25
@@ -32,3 +35,43 @@ pipelines:
lastModifiedDate: 2024-10-25
spotbugs:
lastModifiedDate: 2024-10-25
+ 1625:
+ retail:
+ source:
+ credscan:
+ lastModifiedDate: 2024-11-05
+ policheck:
+ lastModifiedDate: 2024-11-05
+ eslint:
+ lastModifiedDate: 2024-11-05
+ psscriptanalyzer:
+ lastModifiedDate: 2024-11-05
+ armory:
+ lastModifiedDate: 2024-11-05
+ binary:
+ credscan:
+ lastModifiedDate: 2024-11-13
+ binskim:
+ lastModifiedDate: 2024-11-13
+ spotbugs:
+ lastModifiedDate: 2024-11-13
+ 1626:
+ retail:
+ source:
+ credscan:
+ lastModifiedDate: 2024-11-13
+ policheck:
+ lastModifiedDate: 2024-11-13
+ eslint:
+ lastModifiedDate: 2024-11-13
+ psscriptanalyzer:
+ lastModifiedDate: 2024-11-13
+ armory:
+ lastModifiedDate: 2024-11-13
+ binary:
+ credscan:
+ lastModifiedDate: 2024-11-13
+ binskim:
+ lastModifiedDate: 2024-11-13
+ spotbugs:
+ lastModifiedDate: 2024-11-13
diff --git a/.config/guardian/.gdnbaselines b/.config/guardian/.gdnbaselines
new file mode 100644
index 0000000000000..a7ee2a4b69dda
--- /dev/null
+++ b/.config/guardian/.gdnbaselines
@@ -0,0 +1,43 @@
+{
+ "properties": {
+ "helpUri": "https://eng.ms/docs/microsoft-security/security/azure-security/cloudai-security-fundamentals-engineering/security-integration/guardian-wiki/microsoft-guardian/general/baselines"
+ },
+ "version": "1.0.0",
+ "baselines": {
+ "default": {
+ "name": "default",
+ "createdDate": "2024-11-13 00:40:35Z",
+ "lastUpdatedDate": "2024-11-13 00:40:35Z"
+ }
+ },
+ "results": {
+ "48f03e2797fc40ecea50f878a0268947c7e13db1b2fa51aa3981246844fc4c68": {
+ "signature": "48f03e2797fc40ecea50f878a0268947c7e13db1b2fa51aa3981246844fc4c68",
+ "alternativeSignatures": [],
+ "target": "ScanTelemetry_20241113003616898.json",
+ "line": 1,
+ "memberOf": [
+ "default"
+ ],
+ "tool": "credscan",
+ "ruleId": "CSCAN-AZURE0130",
+ "createdDate": "2024-11-13 00:40:35Z",
+ "expirationDate": "2025-05-02 01:29:47Z",
+ "justification": "This error is baselined with an expiration date of 180 days from 2024-11-13 01:29:47Z"
+ },
+ "9cb6eddb3f3e886ad06cae65f5886412ff0c5fb0b96d4e943e4efa237be617b1": {
+ "signature": "9cb6eddb3f3e886ad06cae65f5886412ff0c5fb0b96d4e943e4efa237be617b1",
+ "alternativeSignatures": [],
+ "target": "ScanTelemetry_20241113111547065.json",
+ "line": 1,
+ "memberOf": [
+ "default"
+ ],
+ "tool": "credscan",
+ "ruleId": "CSCAN-AZURE0130",
+ "createdDate": "2024-11-13 11:20:17Z",
+ "expirationDate": "2025-05-02 11:55:15Z",
+ "justification": "This error is baselined with an expiration date of 180 days from 2024-11-13 11:55:15Z"
+ }
+ }
+}
\ No newline at end of file
diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml
index c704adb263db4..7cca0969a168b 100644
--- a/.github/workflows/publish-csharp-apidocs.yml
+++ b/.github/workflows/publish-csharp-apidocs.yml
@@ -20,18 +20,17 @@ permissions:
jobs:
build:
- runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
env:
DOCFXVERSION: 2.62.2
steps:
- uses: actions/checkout@v4
- - name: Setup .NET
- uses: actions/setup-dotnet@v4
- with:
- dotnet-version: 8.0.x
- name: Install DocFX
run: |
dotnet tool update -g docfx
+ - name: Update PATH
+ run: |
+ Add-Content -Value "$env:USERPROFILE\.dotnet\tools" -Encoding utf8 -Path $env:GITHUB_PATH
# NOTE: We need to restore Microsoft.ML.OnnxRuntime.csproj manually to set IncludeMobileTargets=false
# docfx doesn't seem to be able to do that properly resulting in build errors
- name: Restore dependencies
@@ -50,10 +49,12 @@ jobs:
- name: Log source commit
run: git rev-parse --short HEAD > csharp/ApiDocs/csharp/source-version.txt
- name: Move C# docs into site
+ shell: pwsh
run: |
- mkdir -p _site/docs/api
- rm -rf _site/docs/api/csharp
- mv csharp/ApiDocs/csharp _site/docs/api/csharp
+ New-Item -Path _site/docs/api -Force -ItemType "Directory" | Out-Null
+ $OutputDirectory="_site/docs/api/csharp"
+ if (Test-Path $OutputDirectory) { Remove-Item -Recurse -Force $OutputDirectory }
+ Move-Item -Path csharp\ApiDocs\csharp -Destination $OutputDirectory
- name: Upload docs artifact
uses: actions/upload-artifact@v4
with:
diff --git a/CPPLINT.cfg b/CPPLINT.cfg
new file mode 100644
index 0000000000000..12c1c7be0d773
--- /dev/null
+++ b/CPPLINT.cfg
@@ -0,0 +1 @@
+filter=-whitespace
diff --git a/README.md b/README.md
index 8452e26a58d4d..f1817282b61a0 100644
--- a/README.md
+++ b/README.md
@@ -24,8 +24,8 @@
|System|Inference|Training|
|---|---|---|
-|Windows|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20CPU%20CI%20Pipeline?label=Windows+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20CI%20Pipeline?label=Windows+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=10)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20TensorRT%20CI%20Pipeline?label=Windows+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=47)||
-|Linux|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20CI%20Pipeline?label=Linux+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20Minimal%20Build%20E2E%20CI%20Pipeline?label=Linux+CPU+Minimal+Build)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=64)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20TensorRT%20CI%20Pipeline?label=Linux+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=45)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20OpenVINO%20CI%20Pipeline?label=Linux+OpenVINO)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=55)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining/orttraining-ortmodule-distributed?label=Training+Distributed)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=148)|
+|Windows|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20CPU%20CI%20Pipeline?label=Windows+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20CUDA%20CI%20Pipeline?label=Windows+GPU+CUDA)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=218)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20TensorRT%20CI%20Pipeline?label=Windows+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=47)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20WebGPU%20CI%20Pipeline?label=Windows+GPU+WebGPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=228)||
+|Linux|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20CI%20Pipeline?label=Linux+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20Minimal%20Build%20E2E%20CI%20Pipeline?label=Linux+CPU+Minimal+Build)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=64)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20TensorRT%20CI%20Pipeline?label=Linux+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=45)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20OpenVINO%20CI%20Pipeline?label=Linux+OpenVINO)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=55)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84)|
|Mac|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/MacOS%20CI%20Pipeline?label=MacOS+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=13)||
|Android|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Android%20CI%20Pipeline?label=Android)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=53)||
|iOS|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/iOS%20CI%20Pipeline?label=iOS)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=134)||
diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt
index 20142e734dfac..26084ab42ec1c 100644
--- a/ThirdPartyNotices.txt
+++ b/ThirdPartyNotices.txt
@@ -2108,261 +2108,6 @@ SOFTWARE.
_____
-TVM Open Deep Learning Compiler Stack
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "{}"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright {yyyy} {name of copyright owner}
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-CONTRIBUTORS
-
-TVM Contributors
-================
-TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use,
-contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community.
-
-See the [community structure document](http://docs.tvm.ai/contribute/community.html) for the explanation of community structure and contribution guidelines.
-
-## Committers
-- [Tianqi Chen](https://github.com/tqchen) (PMC)
-- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/)
-- [Ziheng Jiang](https://github.com/ZihengJiang)
-- [Haichen Shen](http://homes.cs.washington.edu/~haichen/)
-- [Yizhi Liu](https://github.com/yzhliu)
-
-## Code Owners
-- [Aditya Atluri](https://github.com/adityaatluri) ROCM
-- [Leyuan Wang](https://github.com/Laurawly) TOPI
-- [Yuwei Hu](https://github.com/Huyuwei) TOPI
-- [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
-- [Nick Hynes](https://github.com/nhynes) SGX and secured computing
-- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
-
-## Reviewers
-- [Zhi Chen](https://github.com/zhiics)
-- [Xiaoqiang Dan](https://github.com/xqdan)
-- [Liangfu Chen](https://github.com/liangfu)
-- [Masahiro Masuda](https://github.com/masahi)
-- [Kazutaka Morita](https://github.com/kazum)
-- [Tatsuya Nishiyama](https://github.com/nishi-t)
-- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
-- [Jared Roesch](https://github.com/jroesch)
-- [Siva](https://github.com/srkreddy1238)
-- [Siju Samuel](https://github.com/siju-samuel)
-- [Alex Weaver](https://github.com/alex-weaver)
-- [Yao Wang](https://github.com/kevinthesun)
-- [Jian Weng](https://github.com/were)
-- [Eddie Yan](https://github.com/eqy)
-- [Joshua Z. Zhang](https://github.com/zhreshold)
-
-## List of Contributors
-- [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)
- - To contributors: please add your name to the list.
-- [Qiao Zhang](https://github.com/zhangqiaorjc)
-- [Haolong Zhang](https://github.com/haolongzhangm)
-- [Cody Hao Yu](https://github.com/comaniac)
-- [Chris Nuernberger](https://github.com/cnuernber)
-
-_____
-
FreeBSD: getopt.c file
Copyright (c) 1987, 1993, 1994
diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json
index 1432193ac9080..46349f43923e2 100644
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@@ -1,578 +1,508 @@
{
- "$schema": "https://json.schemastore.org/component-detection-manifest.json",
- "Registrations": [
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "215105818dfde3174fe799600bb0f3cae233d0bf",
- "repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
- }
- }
- },
- {
- "component": {
- "Type": "maven",
- "maven": {
- "GroupId": "org.junit.platform",
- "ArtifactId": "junit-platform-console-standalone",
- "Version": "1.6.2"
- },
- "DevelopmentDependency": true
- }
- },
- {
- "component": {
- "Type": "maven",
- "maven": {
- "GroupId": "com.google.protobuf",
- "ArtifactId": "protobuf-java",
- "Version": "3.21.7"
- },
- "DevelopmentDependency": true
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "2379917985919ed3918dc12cad47f469f245be7a",
- "repositoryUrl": "https://github.com/apache/tvm.git"
- },
- "comments": "needed for TVM EP"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "cabe04d6d6b05356fa8f9741704924788f0dd762",
- "repositoryUrl": "https://github.com/agauniyal/rang.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "a3bcc6981d5dad3afb212689e2c7853d1b1ee45d",
- "repositoryUrl": "https://github.com/NVIDIA/cutlass.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "08f7c7e69f8ea61a0c4151359bc8023be8e9217b",
- "repositoryUrl": "https://github.com/tlc-pack/libbacktrace.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "36a91576edf633479c78649e050f18dd2ddc8103",
- "repositoryUrl": "https://github.com/apache/incubator-tvm-vta.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "111c9be5188f7350c2eac9ddaedd8cca3d7bf394",
- "repositoryUrl": "https://github.com/kazuho/picojson.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "b5e4186d7ab63458e79084842dced166be2ca5b5",
- "repositoryUrl": "https://github.com/lammertb/libcrc.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "e4a4c02764d37c9c3db0d64c4996651a3ef9513c",
- "repositoryUrl": "https://github.com/dmlc/HalideIR.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3",
- "repositoryUrl": "https://github.com/dmlc/dlpack.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "4d49691f1a9d944c3b0aa5e63f1db3cad1f941f8",
- "repositoryUrl": "https://github.com/dmlc/dmlc-core.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "7de7e5d02bf687f971e7668963649728356e0c20",
- "repositoryUrl": "https://github.com/intel/mkl-dnn.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "d860915b0198ddb96f93e9e97a789af156544dc6",
- "repositoryUrl": "https://github.com/tensorflow/tensorflow.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "eddf9023206dc40974c26f589ee2ad63a4227a1e",
- "repositoryUrl": "https://github.com/glennrp/libpng.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "217f52fb121ef92491e5d5f71394b07ce4ead1d0",
- "repositoryUrl": "https://github.com/KjellKod/g3log.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "50893291621658f355bc5b4d450a8d06a563053d",
- "repositoryUrl": "https://github.com/madler/zlib.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "d264a2603493fecda607c1d1cda87fedba77d36b",
- "repositoryUrl": "https://github.com/Microsoft/CNTK.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "971e2e89d08deeae0139d3011d15646fdac13c92",
- "repositoryUrl": "https://github.com/numpy/numpy.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "90537289a04ef5d572496240e2ac3a881be518d2",
- "repositoryUrl": "https://github.com/pytorch/pytorch.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "b31f58de6fa8bbda5353b3c77d9be4914399724d",
- "repositoryUrl": "https://github.com/pytorch/pytorch.git"
- },
- "comments": "pytorch 1.6 used by onnxruntime training image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "7389dbac82d362f296dc2746f10e43ffa1615660",
- "repositoryUrl": "https://github.com/scikit-learn/scikit-learn.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "eeebdab16155d34ff8f5f42137da7df4d1c7eab0",
- "repositoryUrl": "https://github.com/BVLC/caffe.git"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "LLVM",
- "Version": "9.0.0",
- "DownloadUrl": "https://releases.llvm.org/9.0.0/llvm-9.0.0.src.tar.xz"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "FreeBSD GetOpt",
- "Version": "12.0.0",
- "DownloadUrl": "https://svnweb.freebsd.org/base/release/12.0.0/lib/libc/stdlib/getopt.c?revision=341707&view=co"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "Boost",
- "Version": "1.69.0",
- "DownloadUrl": "https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.bz2"
- }
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "02a2a458ac15912d7d87cc1171e811b0c5219ece",
- "repositoryUrl": "https://github.com/grpc/grpc"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "b29b21a81b32ec273f118f589f46d56ad3332420",
- "repositoryUrl": "https://github.com/google/boringssl.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "3be1924221e1326df520f8498d704a5c4c8d0cce",
- "repositoryUrl": "https://github.com/c-ares/c-ares.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "6599cac0965be8e5a835ab7a5684bbef033d5ad0",
- "repositoryUrl": "https://github.com/llvm-mirror/libcxx.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "9245d481eb3e890f708ff2d7dadf2a10c04748ba",
- "repositoryUrl": "https://github.com/llvm-mirror/libcxxabi.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "9ce4a77f61c134bbed28bfd5be5cd7dc0e80f5e3",
- "repositoryUrl": "https://github.com/google/upb.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "type": "other",
- "Other": {
- "Name": "Go",
- "Version": "1.12.6",
- "DownloadUrl": "https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "OpenMPI",
- "Version": "4.0.0",
- "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "OpenMPI",
- "Version": "4.0.4",
- "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz"
- },
- "comments": "openmpi 4.0.4 used by onnxruntime training image"
- }
- },
- {
- "component": {
- "Type": "git",
- "git": {
- "commitHash": "7db3f9c741d3dfd8dda14ffb537ed251280d2025",
- "repositoryUrl": "https://github.com/mpi4py/mpi4py"
- },
- "comments": "mpi4py 3.0.3 used by onnxruntime training image"
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "NCCL",
- "Version": "2.4.8",
- "DownloadUrl": "https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "67afac65ce64fd4dce1494f43e565e8fe34bdffb",
- "repositoryUrl": "https://android.googlesource.com/platform/frameworks/ml"
- },
- "comments": "used by onnxruntime"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "c30b7da2301202da5f9f0529966944f110e5d6e7",
- "repositoryUrl": "https://github.com/openucx/ucx"
- },
- "comments": "middleware between IB verbs and OpenMPI used by onnxruntime training image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "63d1e08e64e7e09408eb63cd8dd7c65ad766f277",
- "repositoryUrl": "https://github.com/nodejs/node"
- },
- "comments": "For Nodejs binding"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "aead4d751c2101e23336aa73f2380df83e7a13f3",
- "repositoryUrl": "https://github.com/pypa/manylinux"
- },
- "comments": "For building our CI build docker image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "c974557598645360fbabac71352b083117e3cc17",
- "repositoryUrl": "https://gitlab.kitware.com/cmake/cmake"
- },
- "comments": "CMake 3.24.3. For building our CI build docker image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "1e5d33e9b9b8631b36f061103a30208b206fd03a",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.9.1"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "6503f05dd59e26a9986bdea097b3da9b3546f45b",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.8.7"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "13c94747c74437e594b7fc242ff7da668e81887c",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.7.9"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "c0a9afe2ac1820409e6173bd1893ebee2cf50270",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.6.12"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "426b022776672fdf3d71ddd98d89af341c88080f",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.5.10"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "transformers",
- "Version": "4.38.0"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "msgpack",
- "Version": "1.0.0"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "tensorboardX",
- "Version": "1.8"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "tensorboard",
- "Version": "2.3.0"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "92cf3702fcfaadc84eb7bef59825a23e0cd84f56",
- "repositoryUrl": "https://github.com/aappleby/smhasher"
- },
- "comments": "MurmurHash3"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "b89da3c5a0aa18fb2c6163ad9984f81ab65b22e3",
- "repositoryUrl": "https://github.com/mestevens/gtest-ios-framework"
- },
- "comments": "gtest-ios-framework"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "277508879878e0a5b5b43599b1bea11f66eb3c6c",
- "repositoryUrl": "https://github.com/dmlc/dlpack.git"
- },
- "comments": "dlpack"
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "SQLite3",
- "Version": "3.22.0",
- "DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "9d0ef119d9fcb9139f831adc224857b791c81140",
- "repositoryUrl": "https://github.com/dlfcn-win32/dlfcn-win32.git"
- },
- "comments": "dlfcn-win32"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "6812205f18ca4ef54372e87e1a13ce4a859434df",
- "repositoryUrl": "https://github.com/python-pillow/Pillow.git"
- },
- "comments": "python-pillow. Implementation logic for anti-aliasing copied by Resize CPU kernel."
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac",
- "repositoryUrl": "https://gitlab.com/libeigen/eigen.git"
- }
- }
- }
- ],
- "Version": 1
+ "$schema": "https://json.schemastore.org/component-detection-manifest.json",
+ "Registrations": [
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "215105818dfde3174fe799600bb0f3cae233d0bf",
+ "repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "maven",
+ "maven": {
+ "GroupId": "org.junit.platform",
+ "ArtifactId": "junit-platform-console-standalone",
+ "Version": "1.6.2"
+ },
+ "DevelopmentDependency": true
+ }
+ },
+ {
+ "component": {
+ "Type": "maven",
+ "maven": {
+ "GroupId": "com.google.protobuf",
+ "ArtifactId": "protobuf-java",
+ "Version": "3.21.7"
+ },
+ "DevelopmentDependency": true
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "e4a4c02764d37c9c3db0d64c4996651a3ef9513c",
+ "repositoryUrl": "https://github.com/dmlc/HalideIR.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3",
+ "repositoryUrl": "https://github.com/dmlc/dlpack.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "4d49691f1a9d944c3b0aa5e63f1db3cad1f941f8",
+ "repositoryUrl": "https://github.com/dmlc/dmlc-core.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "7de7e5d02bf687f971e7668963649728356e0c20",
+ "repositoryUrl": "https://github.com/intel/mkl-dnn.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "d860915b0198ddb96f93e9e97a789af156544dc6",
+ "repositoryUrl": "https://github.com/tensorflow/tensorflow.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "eddf9023206dc40974c26f589ee2ad63a4227a1e",
+ "repositoryUrl": "https://github.com/glennrp/libpng.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "217f52fb121ef92491e5d5f71394b07ce4ead1d0",
+ "repositoryUrl": "https://github.com/KjellKod/g3log.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "50893291621658f355bc5b4d450a8d06a563053d",
+ "repositoryUrl": "https://github.com/madler/zlib.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "d264a2603493fecda607c1d1cda87fedba77d36b",
+ "repositoryUrl": "https://github.com/Microsoft/CNTK.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "971e2e89d08deeae0139d3011d15646fdac13c92",
+ "repositoryUrl": "https://github.com/numpy/numpy.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "90537289a04ef5d572496240e2ac3a881be518d2",
+ "repositoryUrl": "https://github.com/pytorch/pytorch.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "b31f58de6fa8bbda5353b3c77d9be4914399724d",
+ "repositoryUrl": "https://github.com/pytorch/pytorch.git"
+ },
+ "comments": "pytorch 1.6 used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "7389dbac82d362f296dc2746f10e43ffa1615660",
+ "repositoryUrl": "https://github.com/scikit-learn/scikit-learn.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "eeebdab16155d34ff8f5f42137da7df4d1c7eab0",
+ "repositoryUrl": "https://github.com/BVLC/caffe.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "LLVM",
+ "Version": "9.0.0",
+ "DownloadUrl": "https://releases.llvm.org/9.0.0/llvm-9.0.0.src.tar.xz"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "FreeBSD GetOpt",
+ "Version": "12.0.0",
+ "DownloadUrl": "https://svnweb.freebsd.org/base/release/12.0.0/lib/libc/stdlib/getopt.c?revision=341707&view=co"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "Boost",
+ "Version": "1.69.0",
+ "DownloadUrl": "https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.bz2"
+ }
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "02a2a458ac15912d7d87cc1171e811b0c5219ece",
+ "repositoryUrl": "https://github.com/grpc/grpc"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "b29b21a81b32ec273f118f589f46d56ad3332420",
+ "repositoryUrl": "https://github.com/google/boringssl.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "3be1924221e1326df520f8498d704a5c4c8d0cce",
+ "repositoryUrl": "https://github.com/c-ares/c-ares.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "6599cac0965be8e5a835ab7a5684bbef033d5ad0",
+ "repositoryUrl": "https://github.com/llvm-mirror/libcxx.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "9245d481eb3e890f708ff2d7dadf2a10c04748ba",
+ "repositoryUrl": "https://github.com/llvm-mirror/libcxxabi.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "9ce4a77f61c134bbed28bfd5be5cd7dc0e80f5e3",
+ "repositoryUrl": "https://github.com/google/upb.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "type": "other",
+ "Other": {
+ "Name": "Go",
+ "Version": "1.12.6",
+ "DownloadUrl": "https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "OpenMPI",
+ "Version": "4.0.0",
+ "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "OpenMPI",
+ "Version": "4.0.4",
+ "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz"
+ },
+ "comments": "openmpi 4.0.4 used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "Type": "git",
+ "git": {
+ "commitHash": "7db3f9c741d3dfd8dda14ffb537ed251280d2025",
+ "repositoryUrl": "https://github.com/mpi4py/mpi4py"
+ },
+ "comments": "mpi4py 3.0.3 used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "NCCL",
+ "Version": "2.4.8",
+ "DownloadUrl": "https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "67afac65ce64fd4dce1494f43e565e8fe34bdffb",
+ "repositoryUrl": "https://android.googlesource.com/platform/frameworks/ml"
+ },
+ "comments": "used by onnxruntime"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "c30b7da2301202da5f9f0529966944f110e5d6e7",
+ "repositoryUrl": "https://github.com/openucx/ucx"
+ },
+ "comments": "middleware between IB verbs and OpenMPI used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "63d1e08e64e7e09408eb63cd8dd7c65ad766f277",
+ "repositoryUrl": "https://github.com/nodejs/node"
+ },
+ "comments": "For Nodejs binding"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "aead4d751c2101e23336aa73f2380df83e7a13f3",
+ "repositoryUrl": "https://github.com/pypa/manylinux"
+ },
+ "comments": "For building our CI build docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "c974557598645360fbabac71352b083117e3cc17",
+ "repositoryUrl": "https://gitlab.kitware.com/cmake/cmake"
+ },
+ "comments": "CMake 3.24.3. For building our CI build docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "1e5d33e9b9b8631b36f061103a30208b206fd03a",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.9.1"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "6503f05dd59e26a9986bdea097b3da9b3546f45b",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.8.7"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "13c94747c74437e594b7fc242ff7da668e81887c",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.7.9"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "c0a9afe2ac1820409e6173bd1893ebee2cf50270",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.6.12"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "426b022776672fdf3d71ddd98d89af341c88080f",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.5.10"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "transformers",
+ "Version": "4.38.0"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "msgpack",
+ "Version": "1.0.0"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "tensorboardX",
+ "Version": "1.8"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "tensorboard",
+ "Version": "2.3.0"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "92cf3702fcfaadc84eb7bef59825a23e0cd84f56",
+ "repositoryUrl": "https://github.com/aappleby/smhasher"
+ },
+ "comments": "MurmurHash3"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "b89da3c5a0aa18fb2c6163ad9984f81ab65b22e3",
+ "repositoryUrl": "https://github.com/mestevens/gtest-ios-framework"
+ },
+ "comments": "gtest-ios-framework"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "277508879878e0a5b5b43599b1bea11f66eb3c6c",
+ "repositoryUrl": "https://github.com/dmlc/dlpack.git"
+ },
+ "comments": "dlpack"
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "SQLite3",
+ "Version": "3.22.0",
+ "DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "9d0ef119d9fcb9139f831adc224857b791c81140",
+ "repositoryUrl": "https://github.com/dlfcn-win32/dlfcn-win32.git"
+ },
+ "comments": "dlfcn-win32"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "6812205f18ca4ef54372e87e1a13ce4a859434df",
+ "repositoryUrl": "https://github.com/python-pillow/Pillow.git"
+ },
+ "comments": "python-pillow. Implementation logic for anti-aliasing copied by Resize CPU kernel."
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac",
+ "repositoryUrl": "https://gitlab.com/libeigen/eigen.git"
+ }
+ }
+ }
+ ],
+ "Version": 1
}
diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index df27fa5ab1b95..07dff50f9a3bd 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -346,7 +346,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "511eb80847afe6bded34ec491a38d5d78ba2d604",
+ "commitHash": "12a3b24c456cebd9fd11f23ac0164f78129b00c6",
"repositoryUrl": "https://github.com/google/dawn.git"
},
"comments": "dawn"
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 31ebf58b03152..7710ab2f4cac7 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -102,7 +102,6 @@ option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF)
option(onnxruntime_USE_PREINSTALLED_EIGEN "Use pre-installed EIGEN. Need to provide eigen_SOURCE_PATH if turn this on." OFF)
option(onnxruntime_BUILD_BENCHMARKS "Build ONNXRuntime micro-benchmarks" OFF)
-option(onnxruntime_USE_LLVM "Build TVM with LLVM" OFF)
option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF)
cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
@@ -145,14 +144,11 @@ option(onnxruntime_USE_TELEMETRY "Build with Telemetry" OFF)
cmake_dependent_option(onnxruntime_USE_MIMALLOC "Override new/delete and arena allocator with mimalloc" OFF "WIN32;NOT onnxruntime_USE_CUDA;NOT onnxruntime_USE_OPENVINO" OFF)
option(onnxruntime_USE_CANN "Build with CANN support" OFF)
option(onnxruntime_USE_ROCM "Build with AMD GPU support" OFF)
-option(onnxruntime_USE_TVM "Build with TVM support" OFF)
-option(onnxruntime_TVM_CUDA_RUNTIME "Build TVM with CUDA support" OFF)
-option(onnxruntime_TVM_USE_LLVM "Build TVM with LLVM. Set customized path to llvm-config.exe here if need" OFF)
-option(onnxruntime_TVM_USE_HASH "Build ipp-crypto library for support hash algorithm. It is defined for TVM only")
option(onnxruntime_USE_XNNPACK "Build with XNNPACK support. Provides an alternative math library on ARM, WebAssembly and x86." OFF)
option(onnxruntime_USE_WEBNN "Build with WebNN support. Enable hardware acceleration in web browsers." OFF)
option(onnxruntime_USE_WEBGPU "Build with WebGPU support. Enable WebGPU via C/C++ interface." OFF)
option(onnxruntime_USE_EXTERNAL_DAWN "Build with treating Dawn as external dependency. Will not link Dawn at build time." OFF)
+option(onnxruntime_CUSTOM_DAWN_SRC_PATH "Path to custom Dawn src dir.")
# Options related to reducing the binary size produced by the build
# XNNPACK EP requires the internal NHWC contrib ops to be available, so this option must be OFF when onnxruntime_USE_XNNPACK is ON
@@ -257,6 +253,7 @@ cmake_dependent_option(MSVC_Z7_OVERRIDE "replacing /Zi and /ZI with /Z7 when usi
option(onnxruntime_USE_AZURE "Build with azure inferencing support" OFF)
option(onnxruntime_USE_LOCK_FREE_QUEUE "Build with lock-free task queue for threadpool." OFF)
+option(onnxruntime_FORCE_GENERIC_ALGORITHMS "Disable optimized arch-specific algorithms. Use only for testing and debugging generic algorithms." OFF)
# ENABLE_TRAINING includes all training functionality
# The following 2 entry points
@@ -906,11 +903,6 @@ if (onnxruntime_USE_SNPE)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES snpe)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_SNPE=1)
endif()
-if (onnxruntime_USE_TVM)
- list(APPEND ORT_PROVIDER_FLAGS -DUSE_TVM=1)
- list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_TVM=1)
- list(APPEND ONNXRUNTIME_PROVIDER_NAMES tvm)
-endif()
if (onnxruntime_USE_WINML)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_WINML=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_WINML=1)
@@ -981,6 +973,10 @@ if (onnxruntime_USE_LOCK_FREE_QUEUE)
add_compile_definitions(USE_LOCK_FREE_QUEUE)
endif()
+if (onnxruntime_FORCE_GENERIC_ALGORITHMS)
+ add_compile_definitions(FORCE_GENERIC_ALGORITHMS)
+endif()
+
if (onnxruntime_ENABLE_LAZY_TENSOR)
# To support LazyTensor, ORT needs to call Python function from C/C++.
# so onnxruntime_ENABLE_PYTHON is required.
@@ -1313,50 +1309,6 @@ if (onnxruntime_USE_DNNL)
add_compile_definitions(DNNL_OPENMP)
endif()
-# TVM EP
-if (onnxruntime_USE_TVM)
- if (NOT TARGET tvm)
- message(STATUS "Include TVM(*).")
- include(tvm)
- endif()
-
- # ipp-crypto
- if (onnxruntime_TVM_USE_HASH)
- message(STATUS "Include ipp-crypto(*).")
- include(ipp-crypto)
- endif()
-
- # TVM
- if (onnxruntime_TVM_USE_LLVM)
- set(USE_LLVM "${onnxruntime_TVM_USE_LLVM}" CACHE STRING "Path to LLVM for correct TVM build")
- elseif(onnxruntime_USE_LLVM)
- set(USE_LLVM ON CACHE BOOL "Only defined for TVM")
- endif()
-
- if (onnxruntime_TVM_CUDA_RUNTIME)
- set(USE_CUDA ON CACHE BOOL "Only defined for TVM" FORCE)
- endif()
-
- # TODO(vvchernov): customized tvm logger is hidden due to the issue on TVM side (https://github.com/apache/tvm/issues/10139)
- # add_compile_definitions(TVM_LOG_CUSTOMIZE=1)
- # add_library(tvm_custom_logger STATIC ${ONNXRUNTIME_ROOT}/core/providers/tvm/custom_logging.cc)
-
- set(USE_OPENMP gnu CACHE STRING "Only defined for TVM")
- add_subdirectory(${tvm_SOURCE_DIR} ${tvm_BINARY_DIR} EXCLUDE_FROM_ALL)
-
- set_target_properties(tvm PROPERTIES FOLDER ${tvm_SOURCE_DIR})
- # target_link_libraries(tvm PUBLIC tvm_custom_logger)
-
- set(TVM_INCLUDES ${tvm_SOURCE_DIR}/include
- ${tvm_SOURCE_DIR}/3rdparty/dmlc-core/include
- ${tvm_SOURCE_DIR}/3rdparty/dlpack/include
- $<TARGET_PROPERTY:tvm,INTERFACE_INCLUDE_DIRECTORIES>)
-
- set(onnxruntime_tvm_libs onnxruntime_providers_tvm)
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES tvm)
- list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES tvm)
-endif()
-
# onnxruntime-extensions
if (onnxruntime_USE_EXTENSIONS)
include(extensions)
diff --git a/cmake/deps.txt b/cmake/deps.txt
index 9cf92bf417fcb..21f9ee1701c46 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -58,5 +58,5 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d839
composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.7.0.zip;d0753d8d5b39947ca0729d7773cb84653a129eb1
-dawn;https://github.com/google/dawn/archive/511eb80847afe6bded34ec491a38d5d78ba2d604.zip;c493f5aca5586f6634e25d0121c85df71189fb99
+dawn;https://github.com/google/dawn/archive/12a3b24c456cebd9fd11f23ac0164f78129b00c6.zip;ad428f6dc16f1336d584f7bad5714e1097dafc43
kleidiai;https://gitlab.arm.com/kleidi/kleidiai/-/archive/v0.2.0/kleidiai-v0.2.0.zip;B1E3173992FD91F20DB904AB77D6E901778C2681
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index d9e833a2d8cd4..ee7abcbad025c 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -615,12 +615,25 @@ if (onnxruntime_USE_COREML)
endif()
if (onnxruntime_USE_WEBGPU)
- FetchContent_Declare(
- dawn
- URL ${DEP_URL_dawn}
- URL_HASH SHA1=${DEP_SHA1_dawn}
- PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
- )
+ if (onnxruntime_CUSTOM_DAWN_SRC_PATH)
+ # use the custom dawn source path if provided
+ #
+ # specified as:
+ # build.py --use_webgpu --cmake_extra_defines "onnxruntime_CUSTOM_DAWN_SRC_PATH=<path_to_dawn_src>"
+ FetchContent_Declare(
+ dawn
+ SOURCE_DIR ${onnxruntime_CUSTOM_DAWN_SRC_PATH}
+ )
+ else()
+ FetchContent_Declare(
+ dawn
+ URL ${DEP_URL_dawn}
+ URL_HASH SHA1=${DEP_SHA1_dawn}
+ # All previous patches are merged into the upstream dawn project. We don't need to apply any patches right now.
+ # if we need to apply patches in the future, we can uncomment the following line.
+ # PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
+ )
+ endif()
# use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
diff --git a/cmake/external/tvm.cmake b/cmake/external/tvm.cmake
deleted file mode 100644
index 93049c8b85853..0000000000000
--- a/cmake/external/tvm.cmake
+++ /dev/null
@@ -1,24 +0,0 @@
-if (onnxruntime_USE_TVM)
- message(STATUS "onnxruntime_USE_TVM: Fetch tvm for TVM EP")
-
- FetchContent_Declare(
- tvm
- GIT_REPOSITORY https://github.com/apache/tvm.git
- GIT_TAG 2379917985919ed3918dc12cad47f469f245be7a
- )
-
- FetchContent_GetProperties(tvm)
- if(NOT tvm_POPULATED)
- FetchContent_Populate(tvm)
- if (WIN32)
- execute_process(
- COMMAND ${CMAKE_COMMAND} -E create_symlink ${tvm_BINARY_DIR}/${CMAKE_BUILD_TYPE} ${tvm_SOURCE_DIR}/build
- )
- else()
- file(CREATE_LINK ${tvm_BINARY_DIR} ${tvm_SOURCE_DIR}/build SYMBOLIC)
- endif()
- endif()
-
- set(tvm_INCLUDE_DIRS ${tvm_SOURCE_DIR}/include)
-
-endif()
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 1278bb7dc9e7e..732c0511d400f 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -123,7 +123,11 @@ else()
onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c )
endif()
if(NOT APPLE)
- target_link_options(onnxruntime PRIVATE "LINKER:-rpath=\$ORIGIN")
+ include(CheckLinkerFlag)
+ check_linker_flag(CXX "LINKER:-rpath=\$ORIGIN" LINKER_SUPPORT_RPATH)
+ if(LINKER_SUPPORT_RPATH)
+ target_link_options(onnxruntime PRIVATE "LINKER:-rpath=\$ORIGIN")
+ endif()
endif()
endif()
@@ -206,7 +210,6 @@ set(onnxruntime_INTERNAL_LIBRARIES
${PROVIDERS_NNAPI}
${PROVIDERS_QNN}
${PROVIDERS_SNPE}
- ${PROVIDERS_TVM}
${PROVIDERS_RKNPU}
${PROVIDERS_VSINPU}
${PROVIDERS_XNNPACK}
@@ -217,7 +220,6 @@ set(onnxruntime_INTERNAL_LIBRARIES
${onnxruntime_winml}
onnxruntime_optimizer
onnxruntime_providers
- ${onnxruntime_tvm_libs}
onnxruntime_lora
onnxruntime_framework
onnxruntime_graph
diff --git a/cmake/onnxruntime_codegen_tvm.cmake b/cmake/onnxruntime_codegen_tvm.cmake
deleted file mode 100644
index 7b50d8f8603ae..0000000000000
--- a/cmake/onnxruntime_codegen_tvm.cmake
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
-file(GLOB_RECURSE onnxruntime_codegen_common_srcs
- "${ONNXRUNTIME_ROOT}/core/codegen/common/*.h"
- "${ONNXRUNTIME_ROOT}/core/codegen/common/*.cc"
-)
-
-file(GLOB_RECURSE onnxruntime_codegen_tvm_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/codegen/mti/*.h"
- "${ONNXRUNTIME_ROOT}/core/codegen/mti/*.cc"
- "${ONNXRUNTIME_ROOT}/core/codegen/passes/*.h"
- "${ONNXRUNTIME_ROOT}/core/codegen/passes/*.cc"
-)
-
-source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_codegen_common_srcs} ${onnxruntime_codegen_tvm_srcs})
-
-#onnxruntime_codegen_tvm depends on onnxruntime framework
-onnxruntime_add_static_library(onnxruntime_codegen_tvm ${onnxruntime_codegen_common_srcs} ${onnxruntime_codegen_tvm_srcs})
-set_target_properties(onnxruntime_codegen_tvm PROPERTIES FOLDER "ONNXRuntime")
-target_include_directories(onnxruntime_codegen_tvm PRIVATE ${ONNXRUNTIME_ROOT} ${TVM_INCLUDES} ${MKLML_INCLUDE_DIR} ${eigen_INCLUDE_DIRS})
-onnxruntime_add_include_to_target(onnxruntime_codegen_tvm onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers safeint_interface Boost::mp11)
-target_compile_options(onnxruntime_codegen_tvm PRIVATE ${DISABLED_WARNINGS_FOR_TVM})
-# need onnx to build to create headers that this project includes
-add_dependencies(onnxruntime_codegen_tvm ${onnxruntime_EXTERNAL_DEPENDENCIES})
diff --git a/cmake/onnxruntime_csharp.cmake b/cmake/onnxruntime_csharp.cmake
index 22c993d07f7f9..39533429e181c 100644
--- a/cmake/onnxruntime_csharp.cmake
+++ b/cmake/onnxruntime_csharp.cmake
@@ -30,10 +30,6 @@ if (onnxruntime_USE_NNAPI_BUILTIN)
STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_NNAPI;")
endif()
-if (onnxruntime_USE_TVM)
- STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_TVM,")
-endif()
-
if (onnxruntime_USE_OPENVINO)
STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_OPENVINO;")
endif()
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 20bb1fb772189..5124262ec0004 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -36,11 +36,13 @@ onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/qpostprocessor.cpp
${MLAS_SRC_DIR}/qlgavgpool.cpp
${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
- ${MLAS_SRC_DIR}/sqnbitgemm.h
- ${MLAS_SRC_DIR}/sqnbitgemm.cpp
+ ${MLAS_SRC_DIR}/qnbitgemm.h
+ ${MLAS_SRC_DIR}/qnbitgemm.cpp
${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
${MLAS_SRC_DIR}/flashattn.cpp
${MLAS_SRC_DIR}/cast.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding.h
+ ${MLAS_SRC_DIR}/rotary_embedding.cpp
)
target_sources(onnxruntime_mlas PRIVATE
@@ -84,11 +86,15 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.h
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
- ${MLAS_SRC_DIR}/fp16_neon_common.cpp
+ ${MLAS_SRC_DIR}/cast_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp
)
set(mlas_platform_preprocess_srcs
@@ -362,10 +368,12 @@ else()
${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.h
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp
)
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
@@ -383,7 +391,9 @@ else()
${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp
${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp
${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
- ${MLAS_SRC_DIR}/fp16_neon_common.cpp
+ ${MLAS_SRC_DIR}/cast_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp
)
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
@@ -393,7 +403,9 @@ else()
set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/pooling_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
- set_source_files_properties(${MLAS_SRC_DIR}/fp16_neon_common.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+ set_source_files_properties(${MLAS_SRC_DIR}/cast_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+ set_source_files_properties(${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+ set_source_files_properties(${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
endif()
if(ONNXRUNTIME_MLAS_MULTI_ARCH)
@@ -453,7 +465,6 @@ else()
bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1));
return 0;
}
- }
#endif"
HAS_P10_RUNTIME
)
@@ -677,6 +688,13 @@ endif()
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
file(GLOB_RECURSE mlas_platform_srcs
"${MLAS_SRC_DIR}/scalar/*.cpp")
+ elseif (onnxruntime_FORCE_GENERIC_ALGORITHMS)
+ file(GLOB_RECURSE mlas_platform_srcs_generic
+ "${MLAS_SRC_DIR}/scalar/*.cpp")
+ set(mlas_platform_srcs
+ ${mlas_platform_srcs}
+ ${mlas_platform_srcs_generic}
+ )
endif()
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
endif()
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 9666877cdc206..582491de9503d 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -101,9 +101,6 @@ endif()
if(onnxruntime_USE_ROCM)
set(PROVIDERS_ROCM onnxruntime_providers_rocm)
endif()
-if (onnxruntime_USE_TVM)
- set(PROVIDERS_TVM onnxruntime_providers_tvm)
-endif()
if (onnxruntime_USE_XNNPACK)
set(PROVIDERS_XNNPACK onnxruntime_providers_xnnpack)
endif()
@@ -194,10 +191,6 @@ if (onnxruntime_USE_ROCM)
include(onnxruntime_providers_rocm.cmake)
endif()
-if (onnxruntime_USE_TVM)
- include(onnxruntime_providers_tvm.cmake)
-endif()
-
if (onnxruntime_USE_VSINPU)
include(onnxruntime_providers_vsinpu.cmake)
endif()
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 39ad530146b33..4f86717026118 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -224,8 +224,7 @@
include(cutlass)
target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples ${cutlass_SOURCE_DIR}/tools/util/include)
- target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES}
- PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
+ target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime")
diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake
index 5dcee285a5b13..e500957f864f8 100644
--- a/cmake/onnxruntime_providers_openvino.cmake
+++ b/cmake/onnxruntime_providers_openvino.cmake
@@ -11,22 +11,22 @@
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
)
- if (WIN32)
- set(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO Release)
- endif()
-
# Header paths
find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX)
- if(OpenVINO_VERSION VERSION_LESS 2024.0)
- message(FATAL_ERROR "OpenVINO 2024.0 and newer are supported. Please, use latest OpenVINO release")
+ if(OpenVINO_VERSION VERSION_LESS 2024.3)
+ message(FATAL_ERROR "OpenVINO 2024.3 and newer are supported. Please, use latest OpenVINO release")
endif()
if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4)
add_definitions(-DUSE_OVEP_NPU_MEMORY=1)
endif()
- if (WIN32)
- unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO)
+ # If building RelWithDebInfo and the OV package does not have that configuration, map it to Release
+ get_target_property(ov_rt_implib_rwdi openvino::runtime IMPORTED_IMPLIB_RELWITHDEBINFO)
+ if ((CMAKE_BUILD_TYPE STREQUAL RelWithDebInfo) AND NOT ov_rt_implib_rwdi)
+ set_target_properties(openvino::runtime PROPERTIES
+ MAP_IMPORTED_CONFIG_RELWITHDEBINFO Release
+ )
endif()
list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES})
@@ -82,3 +82,8 @@
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
+
+set_target_properties(onnxruntime_providers_openvino PROPERTIES
+ MAP_IMPORTED_CONFIG_RELEASE RelWithDebInfo
+ MAP_IMPORTED_CONFIG_DEBUG RelWithDebInfo
+ )
\ No newline at end of file
diff --git a/cmake/onnxruntime_providers_tvm.cmake b/cmake/onnxruntime_providers_tvm.cmake
deleted file mode 100644
index 8fd50c70dd5d7..0000000000000
--- a/cmake/onnxruntime_providers_tvm.cmake
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
- add_definitions(-DUSE_TVM=1)
- if (onnxruntime_TVM_USE_HASH)
- add_definitions(-DUSE_TVM_HASH=1)
- endif()
-
- if (onnxruntime_TVM_USE_HASH)
- file (GLOB_RECURSE onnxruntime_providers_tvm_cc_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.h"
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.cc"
- )
- else()
- file (GLOB onnxruntime_providers_tvm_cc_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.h"
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.cc"
- )
- endif()
-
- source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_tvm_cc_srcs})
- onnxruntime_add_static_library(onnxruntime_providers_tvm ${onnxruntime_providers_tvm_cc_srcs})
-
- if ( CMAKE_COMPILER_IS_GNUCC )
- target_compile_options(onnxruntime_providers_tvm PRIVATE -Wno-unused-parameter -Wno-missing-field-initializers)
- endif()
-
- target_include_directories(onnxruntime_providers_tvm PRIVATE
- ${TVM_INCLUDES}
- ${PYTHON_INCLUDE_DIRS})
- onnxruntime_add_include_to_target(onnxruntime_providers_tvm onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface)
-
- add_dependencies(onnxruntime_providers_tvm ${onnxruntime_EXTERNAL_DEPENDENCIES})
-
- if (onnxruntime_TVM_USE_HASH)
- add_dependencies(onnxruntime_providers_tvm ippcp_s)
- target_include_directories(onnxruntime_providers_tvm PRIVATE ${IPP_CRYPTO_INCLUDE_DIR})
- target_link_libraries(onnxruntime_providers_tvm PRIVATE ippcp_s)
- endif()
-
- set_target_properties(onnxruntime_providers_tvm PROPERTIES FOLDER "ONNXRuntime")
- set_target_properties(onnxruntime_providers_tvm PROPERTIES LINKER_LANGUAGE CXX)
-
- if (WIN32 AND MSVC)
- # wd4100: identifier' : unreferenced formal parameter
- # wd4127: conditional expression is constant
- # wd4244: conversion from 'int' to 'char', possible loss of data
- # TODO: 4244 should not be disabled
- target_compile_options(onnxruntime_providers_tvm PRIVATE "/wd4100" "/wd4127" "/wd4244")
- else()
- target_compile_options(onnxruntime_providers_tvm PRIVATE "-Wno-error=type-limits")
- endif()
- target_compile_definitions(onnxruntime_providers_tvm PUBLIC DMLC_USE_LOGGING_LIBRARY=)
-
- install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/tvm/tvm_provider_factory.h
- DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/)
-
- if (NOT onnxruntime_BUILD_SHARED_LIB)
- install(TARGETS onnxruntime_providers_tvm
- ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
- LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
- FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
- endif()
\ No newline at end of file
diff --git a/cmake/onnxruntime_providers_vitisai.cmake b/cmake/onnxruntime_providers_vitisai.cmake
index 764cde9491da8..561a323533f48 100644
--- a/cmake/onnxruntime_providers_vitisai.cmake
+++ b/cmake/onnxruntime_providers_vitisai.cmake
@@ -12,6 +12,7 @@
file(GLOB onnxruntime_providers_vitisai_cc_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/*.cc"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/*.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/vitisai/include/vaip/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/imp/*.cc"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/imp/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index 7239b245a7245..5a87252b08573 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -169,8 +169,8 @@ endif()
target_link_libraries(onnxruntime_pybind11_state PRIVATE
onnxruntime_session
${onnxruntime_libs}
- ${PROVIDERS_TVM}
${PROVIDERS_NNAPI}
+ ${PROVIDERS_VSINPU}
${PROVIDERS_XNNPACK}
${PROVIDERS_COREML}
${PROVIDERS_RKNPU}
@@ -184,7 +184,6 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE
onnxruntime_optimizer
onnxruntime_providers
onnxruntime_util
- ${onnxruntime_tvm_libs}
onnxruntime_lora
onnxruntime_framework
onnxruntime_util
@@ -965,37 +964,6 @@ if (onnxruntime_USE_ROCM)
)
endif()
-if (onnxruntime_USE_TVM)
- file(GLOB onnxruntime_python_providers_tvm_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/python/providers/tvm/*.py"
- )
- add_custom_command(
- TARGET onnxruntime_pybind11_state POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/providers
- COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/providers/tvm
- COMMAND ${CMAKE_COMMAND} -E copy
- ${onnxruntime_python_providers_tvm_srcs}
- $/onnxruntime/providers/tvm
- COMMAND ${CMAKE_COMMAND} -E copy
- $
- $/onnxruntime/capi/
- )
-
- add_custom_command(
- TARGET onnxruntime_pybind11_state POST_BUILD
- WORKING_DIRECTORY ${tvm_SOURCE_DIR}/python
- COMMAND ${Python_EXECUTABLE} setup.py bdist_wheel
- )
-
- add_custom_command(
- TARGET onnxruntime_pybind11_state POST_BUILD
- COMMAND ${Python_EXECUTABLE}
- $/onnxruntime/providers/tvm/extend_python_file.py
- --target_file $/onnxruntime/capi/_ld_preload.py
- )
-
-endif()
-
if (onnxruntime_USE_DML)
if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(dml_shared_lib_path ${DML_PACKAGE_DIR}/bin/${onnxruntime_target_platform}-win/${DML_SHARED_LIB})
@@ -1051,4 +1019,13 @@ if (onnxruntime_USE_QNN)
endif()
endif()
+if (onnxruntime_USE_VSINPU)
+ add_custom_command(
+ TARGET onnxruntime_pybind11_state POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy
+ $
+ $/onnxruntime/capi/
+ )
+endif()
+
endif()
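After a local build that enables one of these providers, a quick way to confirm what actually landed in the Python wheel is to query the runtime. A minimal check (not specific to VSINPU; assumes the locally built wheel is the one installed in the environment):

```python
# Minimal sanity check after installing a locally built wheel:
# list the execution providers compiled into this onnxruntime package.
import onnxruntime as ort

print(ort.__version__)
print(ort.get_available_providers())
```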
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 561f65a33b89c..e822f0a3655fc 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -9,9 +9,6 @@ set(TEST_INC_DIR ${ONNXRUNTIME_ROOT})
if (onnxruntime_ENABLE_TRAINING)
list(APPEND TEST_INC_DIR ${ORTTRAINING_ROOT})
endif()
-if (onnxruntime_USE_TVM)
- list(APPEND TEST_INC_DIR ${TVM_INCLUDES})
-endif()
set(disabled_warnings)
function(AddTest)
@@ -67,7 +64,10 @@ function(AddTest)
if(onnxruntime_USE_CUDA)
#XXX: we should not need to do this. onnxruntime_test_all.exe should not have direct dependency on CUDA DLLs,
# otherwise it will impact when CUDA DLLs can be unloaded.
- target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart cudnn_frontend)
+ target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart)
+ if(NOT onnxruntime_CUDA_MINIMAL)
+ target_link_libraries(${_UT_TARGET} PRIVATE cudnn_frontend)
+ endif()
endif()
target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES})
endif()
@@ -111,7 +111,6 @@ function(AddTest)
endif()
target_compile_options(${_UT_TARGET} PRIVATE ${disabled_warnings})
else()
- target_compile_options(${_UT_TARGET} PRIVATE ${DISABLED_WARNINGS_FOR_TVM})
target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options -Wno-error=sign-compare>"
"$<$>:-Wno-error=sign-compare>")
if (${HAS_NOERROR})
@@ -641,13 +640,11 @@ set(ONNXRUNTIME_TEST_LIBS
${PROVIDERS_ACL}
${PROVIDERS_ARMNN}
${PROVIDERS_COREML}
- # ${PROVIDERS_TVM}
${PROVIDERS_XNNPACK}
${PROVIDERS_AZURE}
onnxruntime_optimizer
onnxruntime_providers
onnxruntime_util
- ${onnxruntime_tvm_libs}
onnxruntime_lora
onnxruntime_framework
onnxruntime_util
@@ -749,12 +746,6 @@ if(onnxruntime_USE_AZURE)
list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_azure)
endif()
-if(WIN32)
- if (onnxruntime_USE_TVM)
- list(APPEND disabled_warnings ${DISABLED_WARNINGS_FOR_TVM})
- endif()
-endif()
-
file(GLOB onnxruntime_test_framework_src CONFIGURE_DEPENDS
${onnxruntime_test_framework_src_patterns}
)
@@ -855,9 +846,6 @@ if (onnxruntime_ENABLE_TRAINING_APIS)
list(APPEND all_tests ${onnxruntime_test_training_api_src})
endif()
-if (onnxruntime_USE_TVM)
- list(APPEND all_tests ${onnxruntime_test_tvm_src})
-endif()
if (onnxruntime_USE_OPENVINO)
list(APPEND all_tests ${onnxruntime_test_openvino_src})
@@ -1089,15 +1077,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
COMMAND ${CMAKE_COMMAND} -E copy ${DNNL_DLL_PATH} $
)
endif()
- if(WIN32)
- if (onnxruntime_USE_TVM)
- add_custom_command(
- TARGET ${test_data_target} POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E copy $ $
- )
- endif()
- endif()
-
if(WIN32)
set(wide_get_opt_src_dir ${TEST_SRC_DIR}/win_getopt/wide)
onnxruntime_add_static_library(win_getopt_wide ${wide_get_opt_src_dir}/getopt.cc ${wide_get_opt_src_dir}/include/getopt.h)
@@ -1139,12 +1118,6 @@ if (NOT IOS)
endif()
set_target_properties(onnx_test_runner PROPERTIES FOLDER "ONNXRuntimeTest")
- if (onnxruntime_USE_TVM)
- if (WIN32)
- target_link_options(onnx_test_runner PRIVATE "/STACK:4000000")
- endif()
- endif()
-
install(TARGETS onnx_test_runner
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@@ -1298,11 +1271,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
endif()
set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest")
- if (onnxruntime_USE_TVM)
- if (WIN32)
- target_link_options(onnxruntime_perf_test PRIVATE "/STACK:4000000")
- endif()
- endif()
endif()
diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch
deleted file mode 100644
index 7a2a01d55be46..0000000000000
--- a/cmake/patches/dawn/dawn.patch
+++ /dev/null
@@ -1,81 +0,0 @@
-diff --git a/src/dawn/native/CMakeLists.txt b/src/dawn/native/CMakeLists.txt
-index 9c0bd6fa4e..bf8a57aeac 100644
---- a/src/dawn/native/CMakeLists.txt
-+++ b/src/dawn/native/CMakeLists.txt
-@@ -857,6 +857,11 @@ if (DAWN_ENABLE_SWIFTSHADER)
- target_compile_definitions(dawn_native PRIVATE "DAWN_ENABLE_SWIFTSHADER")
- endif()
-
-+if (IOS)
-+ target_compile_options(dawn_native_objects PRIVATE -fno-objc-arc)
-+ target_compile_options(dawn_native PRIVATE -fno-objc-arc)
-+endif()
-+
- if (DAWN_BUILD_MONOLITHIC_LIBRARY)
- ###############################################################################
- # Do the 'complete_lib' build.
-diff --git a/src/dawn/native/Surface_metal.mm b/src/dawn/native/Surface_metal.mm
-index ce55acbd43..2cfd363479 100644
---- a/src/dawn/native/Surface_metal.mm
-+++ b/src/dawn/native/Surface_metal.mm
-@@ -33,10 +33,18 @@
-
- #import
-
-+#include "dawn/common/Platform.h"
-+
- namespace dawn::native {
-
- bool InheritsFromCAMetalLayer(void* obj) {
-- id object = static_cast(obj);
-+ id object =
-+#if DAWN_PLATFORM_IS(IOS)
-+ (__bridge id)obj;
-+#else // DAWN_PLATFORM_IS(IOS)
-+ static_cast(obj);
-+#endif // DAWN_PLATFORM_IS(IOS)
-+
- return [object isKindOfClass:[CAMetalLayer class]];
- }
-
-diff --git a/src/dawn/native/metal/SharedFenceMTL.mm b/src/dawn/native/metal/SharedFenceMTL.mm
-index bde8bfea07..8906185d6f 100644
---- a/src/dawn/native/metal/SharedFenceMTL.mm
-+++ b/src/dawn/native/metal/SharedFenceMTL.mm
-@@ -25,6 +25,8 @@
- // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-+#include "dawn/common/Platform.h"
-+
- #include "dawn/native/metal/SharedFenceMTL.h"
-
- #include "dawn/native/ChainUtils.h"
-@@ -39,8 +41,13 @@ ResultOrError[> SharedFence::Create(
- const SharedFenceMTLSharedEventDescriptor* descriptor) {
- DAWN_INVALID_IF(descriptor->sharedEvent == nullptr, "MTLSharedEvent is missing.");
- if (@available(macOS 10.14, iOS 12.0, *)) {
-- return AcquireRef(new SharedFence(
-- device, label, static_cast>(descriptor->sharedEvent)));
-+ return AcquireRef(new SharedFence(device, label,
-+#if DAWN_PLATFORM_IS(IOS)
-+ (__bridge id)(descriptor->sharedEvent)
-+#else // DAWN_PLATFORM_IS(IOS)
-+ static_cast>(descriptor->sharedEvent)
-+#endif // DAWN_PLATFORM_IS(IOS)
-+ ));
- } else {
- return DAWN_INTERNAL_ERROR("MTLSharedEvent not supported.");
- }
-diff --git a/src/tint/api/BUILD.cmake b/src/tint/api/BUILD.cmake
-index 0037d83276..6372c4ee77 100644
---- a/src/tint/api/BUILD.cmake
-+++ b/src/tint/api/BUILD.cmake
-@@ -57,6 +57,7 @@ tint_target_add_dependencies(tint_api lib
- tint_lang_wgsl_ast_transform
- tint_lang_wgsl_common
- tint_lang_wgsl_features
-+ tint_lang_wgsl_inspector
- tint_lang_wgsl_program
- tint_lang_wgsl_sem
- tint_lang_wgsl_writer_ir_to_program
diff --git a/csharp/ApiDocs/docfx.json b/csharp/ApiDocs/docfx.json
index 0671d4aeb7d95..88a3283ad76e8 100644
--- a/csharp/ApiDocs/docfx.json
+++ b/csharp/ApiDocs/docfx.json
@@ -14,7 +14,7 @@
"disableDefaultFilter": false,
"noRestore": true,
"properties": {
- "AllowUnsafeBlocks": true,
+ "AllowUnsafeBlocks": "true",
"TargetFramework": "net8.0",
"Nullable": "enable",
"LangVersion": "8.0",
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
index be157a0419fc0..d628b065ceaa7 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
@@ -1142,9 +1142,6 @@ IntPtr[] outputValues /* An array of output value pointers. Array must be alloca
[DllImport(NativeLib.DllName, CharSet = CharSet.Ansi)]
public static extern IntPtr /*(OrtStatus*)*/ OrtSessionOptionsAppendExecutionProvider_MIGraphX(IntPtr /*(OrtSessionOptions*)*/ options, int device_id);
-
- [DllImport(NativeLib.DllName, CharSet = CharSet.Ansi)]
- public static extern IntPtr /*(OrtStatus*)*/ OrtSessionOptionsAppendExecutionProvider_Tvm(IntPtr /*(OrtSessionOptions*) */ options, byte[] /*(char char*)*/ settings);
#endif
///
/// Append a TensorRT EP instance (configured based on given provider options) to the native OrtSessionOptions instance
@@ -1272,7 +1269,7 @@ IntPtr[] outputValues /* An array of output value pointers. Array must be alloca
///
/// Append an execution provider instance to the native OrtSessionOptions instance.
///
- /// 'SNPE' and 'XNNPACK' are currently supported as providerName values.
+ /// 'SNPE', 'XNNPACK' and 'CoreML' are currently supported as providerName values.
///
/// The number of providerOptionsKeys must match the number of providerOptionsValues and equal numKeys.
///
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs
index 3acd84b3016de..bd450451a1265 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs
@@ -146,27 +146,6 @@ public static SessionOptions MakeSessionOptionWithTensorrtProvider(OrtTensorRTPr
}
}
- ///
- /// A helper method to construct a SessionOptions object for TVM execution.
- /// Use only if you have the onnxruntime package specific to this Execution Provider.
- ///
- /// settings string, comprises of comma separated key:value pairs. default is empty
- /// A SessionsOptions() object configured for execution with TVM
- public static SessionOptions MakeSessionOptionWithTvmProvider(String settings = "")
- {
- SessionOptions options = new SessionOptions();
- try
- {
- options.AppendExecutionProvider_Tvm(settings);
- return options;
- }
- catch (Exception)
- {
- options.Dispose();
- throw;
- }
- }
-
///
/// A helper method to construct a SessionOptions object for ROCM execution.
/// Use only if ROCM is installed and you have the onnxruntime package specific to this Execution Provider.
@@ -397,20 +376,6 @@ public void AppendExecutionProvider_CoreML(CoreMLFlags coremlFlags = CoreMLFlags
#endif
}
- ///
- /// Use only if you have the onnxruntime package specific to this Execution Provider.
- ///
- /// string with TVM specific settings
- public void AppendExecutionProvider_Tvm(string settings = "")
- {
-#if __MOBILE__
- throw new NotSupportedException("The TVM Execution Provider is not supported in this build");
-#else
- var utf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(settings);
- NativeApiStatus.VerifySuccess(NativeMethods.OrtSessionOptionsAppendExecutionProvider_Tvm(handle, utf8));
-#endif
- }
-
private class ExecutionProviderAppender
{
private byte[] _utf8ProviderName;
@@ -430,16 +395,10 @@ public IntPtr Appender(IntPtr handle, IntPtr[] optKeys, IntPtr[] optValues, UInt
///
/// Append QNN, SNPE or XNNPACK execution provider
///
- /// Execution provider to add. 'QNN', 'SNPE' or 'XNNPACK' are currently supported.
+ /// Execution provider to add. 'QNN', 'SNPE', 'XNNPACK', 'CoreML' and 'AZURE' are currently supported.
/// Optional key/value pairs to specify execution provider options.
public void AppendExecutionProvider(string providerName, Dictionary providerOptions = null)
{
- if (providerName != "SNPE" && providerName != "XNNPACK" && providerName != "QNN" && providerName != "AZURE")
- {
- throw new NotSupportedException(
- "Only QNN, SNPE, XNNPACK and AZURE execution providers can be enabled by this method.");
- }
-
if (providerOptions == null)
{
providerOptions = new Dictionary();
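With the name whitelist removed, `AppendExecutionProvider` now forwards any provider name to the native API. For readers who work from the Python bindings instead, a rough equivalent of selecting CoreML with a CPU fallback is sketched below; the model path is a placeholder, and CoreML is only present in packages built with it enabled.

```python
# Sketch: selecting CoreML with a CPU fallback from the Python API.
# Provider availability depends on how the installed package was built.
import onnxruntime as ort

session = ort.InferenceSession(
    "model.onnx",  # placeholder model path
    providers=["CoreMLExecutionProvider", "CPUExecutionProvider"],
)
print(session.get_providers())  # shows which requested providers were applied
```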
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
index aa0e6ee62248a..17738da515134 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
@@ -146,10 +146,6 @@ public void TestSessionOptions()
opt.AppendExecutionProvider_Nnapi(0);
#endif
-#if USE_TVM
- opt.AppendExecutionProvider_Tvm("Vulkan -device=amd_apu");
-#endif
-
#if USE_OPENVINO
opt.AppendExecutionProvider_OpenVINO();
#endif
@@ -179,6 +175,12 @@ public void TestSessionOptions()
ex = Assert.Throws(() => { opt.AppendExecutionProvider("QNN"); });
Assert.Contains("QNN execution provider is not supported in this build", ex.Message);
#endif
+#if USE_COREML
+ opt.AppendExecutionProvider("CoreML");
+#else
+ ex = Assert.Throws(() => { opt.AppendExecutionProvider("CoreML"); });
+ Assert.Contains("CoreML execution provider is not supported in this build", ex.Message);
+#endif
opt.AppendExecutionProvider_CPU(1);
}
@@ -2041,7 +2043,7 @@ public SkipNonPackageTests()
}
// Test hangs on mobile.
-#if !(ANDROID || IOS)
+#if !(ANDROID || IOS)
[Fact(DisplayName = "TestModelRunAsyncTask")]
private async Task TestModelRunAsyncTask()
{
diff --git a/dockerfiles/Dockerfile.cuda b/dockerfiles/Dockerfile.cuda
index ce4560e9b0c7c..40f11dca623a7 100644
--- a/dockerfiles/Dockerfile.cuda
+++ b/dockerfiles/Dockerfile.cuda
@@ -48,7 +48,7 @@ RUN cd /code \
&& python3 -m venv /code/env \
&& . /code/env/bin/activate \
&& pip install --upgrade psutil setuptools wheel packaging \
- && pip install -r tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt \
+ && pip install -r /code/tools/ci_build/github/linux/python/requirements.txt \
&& python /code/tools/ci_build/build.py --build_dir /code/build/Linux \
--allow_running_as_root --skip_submodule_sync \
--use_cuda --cuda_home /usr/local/cuda \
diff --git a/dockerfiles/Dockerfile.migraphx b/dockerfiles/Dockerfile.migraphx
index c5d998d503899..876a07e4ffaf6 100644
--- a/dockerfiles/Dockerfile.migraphx
+++ b/dockerfiles/Dockerfile.migraphx
@@ -10,7 +10,7 @@ FROM rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
ARG ONNXRUNTIME_BRANCH=main
-ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH}
+ENV PATH=/code/cmake-3.27.3-linux-x86_64/bin:${PATH}
RUN apt-get update &&\
apt-get install -y migraphx
diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino
index 39e75a68a369f..d1ebdae3cbdd6 100644
--- a/dockerfiles/Dockerfile.openvino
+++ b/dockerfiles/Dockerfile.openvino
@@ -11,7 +11,7 @@ FROM openvino/ubuntu22_runtime:${OPENVINO_VERSION} AS builder
ENV WORKDIR_PATH=/home/openvino
WORKDIR $WORKDIR_PATH
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
ARG DEVICE=CPU
ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git
@@ -41,7 +41,7 @@ RUN tar cvf GPL_sources.tar.gz /sources
# Deploy stage
FROM openvino/ubuntu22_runtime:${OPENVINO_VERSION}
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
USER root
COPY --from=builder /home/openvino/onnxruntime/build/Linux/Release/dist/*.whl ./
COPY --from=builder /GPL_sources.tar.gz ./
@@ -50,7 +50,7 @@ ARG BUILD_UID=1001
ARG BUILD_USER=onnxruntimedev
RUN adduser --uid $BUILD_UID $BUILD_USER
RUN usermod -a -G video,users ${BUILD_USER}
-ENV WORKDIR_PATH /home/${BUILD_USER}
+ENV WORKDIR_PATH=/home/${BUILD_USER}
WORKDIR ${WORKDIR_PATH}
USER ${BUILD_USER}
diff --git a/dockerfiles/Dockerfile.rocm b/dockerfiles/Dockerfile.rocm
index bef8d7a5f47d2..aca8c3feaff71 100644
--- a/dockerfiles/Dockerfile.rocm
+++ b/dockerfiles/Dockerfile.rocm
@@ -12,7 +12,7 @@ ARG ONNXRUNTIME_BRANCH=main
WORKDIR /code
-ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH}
+ENV PATH=/code/cmake-3.27.3-linux-x86_64/bin:${PATH}
# Prepare onnxruntime repository & build onnxruntime
RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\
diff --git a/dockerfiles/Dockerfile.tensorrt b/dockerfiles/Dockerfile.tensorrt
index ef51d41c5ff1b..24947df6308a6 100644
--- a/dockerfiles/Dockerfile.tensorrt
+++ b/dockerfiles/Dockerfile.tensorrt
@@ -17,7 +17,7 @@ RUN apt-get update &&\
RUN unattended-upgrade
WORKDIR /code
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
+ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
# Prepare onnxruntime repository & build onnxruntime with TensorRT
RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\
diff --git a/dockerfiles/Dockerfile.vitisai b/dockerfiles/Dockerfile.vitisai
index e11ab70a61332..c6226155e01e3 100644
--- a/dockerfiles/Dockerfile.vitisai
+++ b/dockerfiles/Dockerfile.vitisai
@@ -22,8 +22,8 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
-ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:$PATH
-ENV LD_LIBRARY_PATH /opt/xilinx/xrt/lib:$LD_LIBRARY_PATH
+ENV PATH=/code/cmake-3.27.3-linux-x86_64/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/xilinx/xrt/lib:$LD_LIBRARY_PATH
WORKDIR /code
RUN . $VAI_ROOT/conda/etc/profile.d/conda.sh &&\
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index b87532debe4bc..6ea3f93cdea12 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -1596,6 +1596,8 @@ This version of the operator has been available since version 1 of the 'com.micr
(Optional) Hardware architecture.
main_context : int
Usually each EPContext node is associated with a single graph partition. But in some cases, such as QNN, a single EPContext node contains all partitions. In that case, the node with ep_cache_context should set main_context=1; other nodes set main_context=0 and skip ep_cache_context. The path is relative to this ONNX file. Default is 1.
+max_size : int
Max size in the context. Usage depends on the EP.
notes : string
(Optional) Some notes for the model
onnx_model_filename : string
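To make the new attribute above concrete, here is a hedged Python sketch of an EPContext node that carries `max_size`, built with `onnx.helper`. All values are made up; the companion attributes shown (embed_mode, ep_cache_context, source) are assumed from the existing EPContext schema and are not part of this change.

```python
# Illustrative sketch of an EPContext node including the new max_size attribute.
# Values are placeholders; embed_mode/ep_cache_context/source are assumed to be
# part of the existing EPContext schema and are not introduced by this diff.
from onnx import helper

ep_context_node = helper.make_node(
    "EPContext",
    inputs=["input"],
    outputs=["output"],
    domain="com.microsoft",
    main_context=1,                    # this node owns the cached context
    embed_mode=0,                      # ep_cache_context is a path, not an embedded blob
    ep_cache_context="model_ctx.bin",  # path relative to the ONNX file (placeholder)
    source="QNN",                      # EP that produced the cached context (placeholder)
    max_size=4096,                     # new attribute: max size in the context, EP-specific
)
print(ep_context_node)
```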
diff --git a/docs/How_To_Update_ONNX_Dev_Notes.md b/docs/How_To_Update_ONNX_Dev_Notes.md
index 4d8a286bde66e..199e6671f6a1a 100644
--- a/docs/How_To_Update_ONNX_Dev_Notes.md
+++ b/docs/How_To_Update_ONNX_Dev_Notes.md
@@ -21,7 +21,7 @@ This file should be generated. See [cgmanifests/README](/cgmanifests/README.md)
- [onnxruntime/test/python/requirements.txt](/onnxruntime/test/python/requirements.txt)
- [tools/ci_build/github/linux/docker/scripts/requirements.txt](/tools/ci_build/github/linux/docker/scripts/requirements.txt)
- [tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt](/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt)
-- [tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt](/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt)
+- [tools/ci_build/github/linux/python/requirements.txt](/tools/ci_build/github/linux/python/requirements.txt)
- Run `git grep -rn "onnx==1" .` to find other locations and update this document if necessary.
1. If there is any change to `cmake/external/onnx/onnx/*.in.proto`, you need to regenerate OnnxMl.cs.
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index e23a52757dedb..eeb8ebb3ccefe 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -453,6 +453,7 @@ Do not modify directly.*
|SVMClassifier|*in* X:**T1**]
*out* Y:**T2**
*out* Z:**tensor(float)**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T2** = tensor(int64), tensor(string)|
|SVMRegressor|*in* X:**T**
*out* Y:**tensor(float)**|1+|**T** = tensor(float)|
|Scaler|*in* X:**T**
*out* Y:**tensor(float)**|1+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
+|TreeEnsemble|*in* X:**T**
*out* Y:**T**|5+|**T** = tensor(double), tensor(float)|
|TreeEnsembleClassifier|*in* X:**T1**
*out* Y:**T2**
*out* Z:**tensor(float)**|3+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T2** = tensor(int64), tensor(string)|
|||[1, 2]|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T2** = tensor(int64), tensor(string)|
|TreeEnsembleRegressor|*in* X:**T**
*out* Y:**tensor(float)**|3+|**T** = tensor(double), tensor(float)|
@@ -1086,11 +1087,13 @@ Do not modify directly.*
|GreaterOrEqual|*in* A:**T**
*in* B:**T**
*out* C:**T1**|16+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)|
|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)|
|GridSample|*in* X:**T1**
*in* grid:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(float), tensor(float16)|
+|GroupNorm||21+|**M** = tensor(float), tensor(float16)
**T** = tensor(float), tensor(float16)|
|HardSigmoid|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(float), tensor(float16)|
|Hardmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)|
|||11+|**T** = tensor(float), tensor(float16)|
|||1+|**T** = tensor(float), tensor(float16)|
-|Identity|*in* input:**T**
*out* output:**T**
or
*in* input:**V**
*out* output:**V**|19+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Identity|*in* input:**T**
*out* output:**T**
or
*in* input:**V**
*out* output:**V**|21+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||19+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||16+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||14+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -1189,7 +1192,8 @@ Do not modify directly.*
|||12+|**T** = tensor(float), tensor(float16), tensor(int32)
**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)|
|||7+|**T** = tensor(float), tensor(float16)|
|QLinearConv|*in* x:**T1**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T1**
*in* w:**T2**
*in* w_scale:**tensor(float)**
*in* w_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*in* B:**T4**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)
**T4** = tensor(int32)|
-|QLinearMatMul|*in* a:**T1**
*in* a_scale:**TS**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**TS**
*in* b_zero_point:**T2**
*in* y_scale:**TS**
*in* y_zero_point:**T3**
*out* y:**T3**
or
*in* a:**T1**
*in* a_scale:**tensor(float)**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**tensor(float)**
*in* b_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)|
+|QLinearMatMul|*in* a:**T1**
*in* a_scale:**TS**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**TS**
*in* b_zero_point:**T2**
*in* y_scale:**TS**
*in* y_zero_point:**T3**
*out* y:**T3**
or
*in* a:**T1**
*in* a_scale:**tensor(float)**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**tensor(float)**
*in* b_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*out* y:**T3**|21+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)|
+|||10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)|
|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**
or
*in* x:**T1**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T2**
*out* y:**T2**|21+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)|
|||19+|**T1** = tensor(float), tensor(float16), tensor(int32)
**T2** = tensor(int8), tensor(uint8)|
|||13+|**T1** = tensor(float), tensor(int32)
**T2** = tensor(int8), tensor(uint8)|
diff --git a/docs/TVM_EP.md b/docs/TVM_EP.md
deleted file mode 100644
index df59d5c05855c..0000000000000
--- a/docs/TVM_EP.md
+++ /dev/null
@@ -1,319 +0,0 @@
-# TVM Execution Provider
-
-## Contents
-
-- [Introduction](#introduction)
-- [Build](#build-onnx-runtime-with-the-tvm-execution-provider)
- - [Linux](#linux)
- - [Windows](#windows)
-- [Configuration options](#configuration-options)
-- [Performance Tuning](#performance-tuning)
- - [Using precompiled model](#using-precompiled-model)
-- [Samples](#samples)
-- [Known issues](#known-issues)
-
-
-## Introduction
-
-TVM is an execution provider for ONNX Runtime that is built on top of Apache TVM. It enables ONNX Runtime users to leverage Apache TVM model optimizations.
-TVM EP is currently in "Preview". It's been tested to work on a handful of models on Linux or Windows, but not on MacOS.
-
-## Build ONNX Runtime with the TVM Execution Provider
-
-### **Linux**
-Install the minimal prerequisites on Ubuntu/Debian-like Linux operating systems:
-```bash
-apt-get install -y python3 python3-dev python3-pip python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev llvm-12
-pip3 install numpy decorator attrs nasm
-```
-Note: since ONNX Runtime with the TVM EP is built with the Intel ipp-crypto library, there are additional requirements. The gcc (and g++) compiler version should be 8.2 or higher, and the nasm version should be 2.14.02 or higher. Problems with older nasm versions are described [here](https://github.com/intel/ipp-crypto/issues/9) and [here](https://bugzilla.nasm.us/show_bug.cgi?id=3392205). For Ubuntu LTS 18, `apt-get install nasm` is not enough because it provides version 2.13.02; see the instructions for installing from source [here](https://stackoverflow.com/questions/36144930/steps-to-install-nasm-offline-on-ubuntu).
-
-Also, the current implementation has `NVidia GPU` support for TVM EP. For now, you can use only `NVidia GPU` with CUDA Toolkit support.
-To do this, make sure you have installed the NVidia driver and CUDA Toolkit.
-More detailed instructions can be found on the [official page](https://developer.nvidia.com/cuda-toolkit).
-
-Clone this repo.
-In order to build ONNXRT you will need to have CMake 3.18 or higher. In Ubuntu 20.04 you can use the following commands to install the latest version of CMake:
-
-```bash
-sudo apt-get update
-sudo apt-get install gpg wget
-
-wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
-
-echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main' | sudo tee /etc/apt/sources.list.d/kitware.list >/dev/null
-sudo apt-get update
-
-sudo rm /usr/share/keyrings/kitware-archive-keyring.gpg
-sudo apt-get install kitware-archive-keyring
-
-sudo apt-get install cmake
-```
-
-Build ONNX Runtime (TVM x86):
-```bash
-./build.sh --config Release --enable_pybind --build_wheel --parallel --skip_tests --skip_onnx_tests --use_tvm
-```
-
-Build ONNX Runtime (TVM with CUDA support):
-```bash
-./build.sh --config Release --enable_pybind --build_wheel --parallel --skip_tests --skip_onnx_tests --use_tvm --tvm_cuda_runtime
-```
-
-This command builds both `TVM` and `onnxruntime-tvm`. It creates two wheels, one for each project.
-Build the python API for ONNX Runtime instead of using the standard package. Instructions for this are given below.
-
-Package for TVM:
-```bash
-cd
-python3 -m pip uninstall tvm -y
-whl_path=$(find ./build//Release/_deps/tvm-src/python/dist -name "*.whl")
-python3 -m pip install $whl_path
-```
-
-Package for TVM EP:
-```bash
-cd
-python3 -m pip uninstall onnxruntime onnxruntime-tvm -y
-whl_path=$(find ./build//Release/dist -name "*.whl")
-python3 -m pip install $whl_path
-```
-
-Alternatively, you can set `PYTHONPATH` to tell python where to find the ONNXRT library and the TVM library.
-```bash
-export PYTHONPATH=/build//Release:${PYTHONPATH}
-export PYTHONPATH=/build//Release/_deps/tvm-src/python:${PYTHONPATH}
-```
-
-### **Windows**
-Install the minimal prerequisites on Windows: Git, CMake, Visual Studio, Python, LLVM
-- Git: Download Git for Windows from [here](https://git-scm.com/download/win) and install it. Please make sure that the git.exe path is included in the environment variable. By default, it should be added. To check git after the installation use `git --version` in command line (cmd).
-- CMake: use [the link](https://cmake.org/download/) to download and install CMake. msi-file is recommended for it. To verify CMake installation use `cmake --version` in cmd.
-- Visual Studio: Download from [here](https://visualstudio.microsoft.com/ru/downloads/) and install Visual Studio 20** Community & Visual Studio Build Tools respectively. It is recommended not to change the default installation path. Choose the "Desktop development with C++" workload and make sure that both options of “MSVC [contemporary version] C++ build tools” and “Windows 10 SDK” are selected.
-- Python: Download Python 3.* from [here](https://www.python.org/downloads/windows/) and install it. Make sure the “Add Python to PATH” option is checked so that the installer adds the Python directory to the environment variable directly. To check Python after the installation, use `python` from cmd. The expected output is similar to the following:
-```cmd
-Python 3.10.5 (tags/v3.10.5:f377153, Jun 6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)] on win32
-Type "help", "copyright", "credits" or "license" for more information.
->>>
-```
-Use `quit()` to exit from python interface.
-- LLVM: the compiler is not necessary for pure ONNX Runtime installation but it is needed for TVM EP by default.
-```cmd
-git clone --depth 1 --branch release/11.x https://github.com/llvm/llvm-project.git
-cmake -S llvm -B build -DLLVM_ENABLE_PROJECTS="clang;libcxx;libcxxabi" -DLLVM_TARGETS_TO_BUILD=X86 -Thost=x64 -DCMAKE_BUILD_TYPE=Release -G "Visual Studio 17 2022"
-cmake --build ./build --config Release
-```
-- Dependencies of ipp-crypto:
-1. Install the asm compiler (nasm) on Windows with:
-```cmd
-winget install nasm -i
-```
-
-Add it to PATH (instructions for the Windows GUI can be found [here](https://www.computerhope.com/issues/ch000549.htm#dospath)) or via cmd:
-```cmd
-set PATH="%PATH%;C:\Program Files\NASM"
-```
-
-or
-```cmd
-setx PATH "%PATH%;C:\Program Files\NASM"
-```
-
-Check with `nasm --version` in the command prompt.
-
-2. Install OpenSSL on Windows using the msi-file from [here](https://slproweb.com/products/Win32OpenSSL.html)
-Add the path to the directory containing the executable (e.g. "C:\Program Files\OpenSSL-Win64\bin") to PATH (see instructions above).
-
-Check with `openssl version` in the command prompt.
-
-3. A correct build of ipp-crypto requires specific environment variables for a supported MSVC compiler. The long way to adjust the environment is to follow the instructions [here](https://docs.microsoft.com/en-us/cpp/build/building-on-the-command-line?view=msvc-170&viewFallbackFrom=vs-2017). The quick way is to use the VS Developer command prompt, where the environment has already been adjusted, or to add the required paths to a standard Windows command prompt:
-```cmd
-set INCLUDE=C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.32.31326\include;C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt
-```
-
-Take into account that the MSVC and Kit versions are specific to the Visual Studio installation on the machine; the values specified here are only an example.
-
-
-
-To use an NVIDIA GPU (optional), CUDA and cuDNN should be installed.
-- CUDA: Install CUDA by the [link](https://developer.nvidia.com/cuda-11.0-download-archive).
-- cuDNN: download cuDNN installer from [here](https://developer.nvidia.com/rdp/cudnn-archive). Choose v8.* for corresponding CUDA v11.*, unzip it, and move cuDNN files as following:
-1. [unzipped dir]\bin\ → C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin
-2. [unzipped dir]\include\ → C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\include
-3. [unzipped dir]\lib\ → C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\lib
-
-To verify the CUDA installation use `nvcc --version` in cmd.
-
-
-
-#### **Build ONNX Runtime with TVM Execution Provider from source (Python):**
-- Use command line and clone sources from github:
-```cmd
-git clone --recursive https://github.com/Microsoft/onnxruntime
-cd onnxruntime
-```
-- CPU build:
-```
-build.bat --config Release --enable_pybind --build_wheel --skip_tests --parallel --use_tvm --skip_onnx_tests --cmake_generator "Visual Studio 17 2022" --llvm_config /build/Release/bin/llvm-config.exe
-```
-- GPU build:
-```
-build.bat --config Release --enable_pybind --build_wheel --skip_tests --parallel --use_tvm --skip_onnx_tests --cmake_generator "Visual Studio 17 2022" --llvm_config /build/Release/bin/llvm-config.exe --use_cuda --cudnn_home “C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.*” --cuda_home “C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.*”
-```
-In both cases (CPU, GPU) the following cmake generators are available: "Visual Studio 17 2022" and "Ninja". The handshake mechanism can also be switched on with the `--use_tvm_hash` flag; in that case the ipp-crypto library is built with its dependencies, see details above.
-- Install the python wheel package for ONNX Runtime:
-The default path to the package is `/build/Windows/Release/Release/dist`. Note that it differs from the package path on Linux. Before installation, check the names of the wheel packages and use the corresponding one. It can look like the following:
-```cmd
-python -m pip install .\onnxruntime\build\Windows\Release\Release\dist\onnxruntime_tvm-1.6.0-cp38-cp38-win_amd64.whl
-```
-- Install the python wheel package for TVM, since its python API is used inside the TVM EP:
-It can look like the following:
-```cmd
-python -m pip install .\onnxruntime\build\Windows\Release\_deps\tvm-src\python\dist\tvm-0.9.dev1728+g3425ed846-cp39-cp39-win_amd64.whl
-```
-- Verify the result with a python script. Note: for a correct result, python should not be launched from a directory containing an 'onnxruntime' directory:
-```python
-import onnxruntime
-print(onnxruntime.__version__)
-print(onnxruntime.get_device())
-print(onnxruntime.get_available_providers())
-```
-- Uninstall procedure:
-```cmd
-pip uninstall onnxruntime-tvm
-```
-
-#### **Build ONNX Runtime with TVM Execution Provider from source (C#):**
-- Use command line and clone sources from github:
-```cmd
-git clone --recursive https://github.com/Microsoft/onnxruntime
-cd onnxruntime
-```
-- CPU build:
-
-Make sure you download [nuget.exe](https://docs.microsoft.com/en-us/nuget/install-nuget-client-tools#nugetexe-cli) and add path to it into `PATH` env.
-```
-build.bat --config Release --build_nuget --skip_tests --parallel --use_tvm --skip_onnx_tests --cmake_generator "Visual Studio 17 2022" --llvm_config llvm-config.exe
-```
-- Install C# nuget package for TVM EP. Default path to the package is `\build\Windows\Release\Release`.
-
-
-## Configuration options
-The TVM Execution Provider can be configured with the following provider options:
-1. Python
-```python
-po = [dict(executor=tvm_executor_type,
- so_folder=folder_with_pretuned_files,
- check_hash=check_hash,
- hash_file_path=hash_file_path,
- target=client_target,
- target_host=client_target_host,
- opt_level=client_opt_level,
- freeze_weights=freeze,
- to_nhwc=layout_transform,
- tuning_type=tvm_optimizer_type,
- tuning_file_path=client_tuning_logfile,
- input_names = input_names_str,
- input_shapes = input_shapes_str)]
-tvm_session = onnxruntime.InferenceSession(model_path, providers=["TvmExecutionProvider"], provider_options=po)
-```
-
-2. C#
-
-Currently, only precompiled models are supported in C# (see the related section below).
-
-```CSharp
-SessionOptions session_options = new SessionOptions{};
-string tvm_ep_options =
- $"executor: {tvm_executor_type}, " +
- $"so_folder: {folder_with_pretuned_files}, " +
- $"check_hash: {check_hash}, " +
- $"hash_file_path: {hash_file_path}, " +
- $"target: {client_target}, " +
- $"target_host: {client_target_host}, " +
- $"opt_level: {client_opt_level}, " +
- $"freeze_weights: {freeze}, " +
- $"to_nhwc: {layout_transform}, " +
- $"tuning_type: {tvm_optimizer_type}, " +
- $"tuning_file_path: {client_tuning_logfile}, " +
- $"input_names: {input_names_str}, " +
- $"input_shapes: {input_shapes_str}";
-
-session_options.AppendExecutionProvider_Tvm(tvm_ep_options);
-using var tvm_session = new InferenceSession(modelFilePath, session_options);
-```
-
-
-- `executor` is the executor type used by TVM. There is a choice between two types: GraphExecutor and VirtualMachine, which correspond to the "graph" and "vm" tags. VirtualMachine is used by default.
-- `so_folder` is the path to a folder with the set of files (.ro-, .so/.dll-files and weights) obtained after model tuning. These files are used for executor compilation instead of the onnx-model, but the latter is still needed for ONNX Runtime.
-- `check_hash` means that it is necessary to perform a HASH check for the model obtained in the `so_folder` parameter. It is `False` by default.
-- `hash_file_path` is the path to a file that contains the pre-computed HASH for the ONNX model whose tuning result is located in the path passed via the `so_folder` parameter.
- If an empty string is passed as this value, the file will be searched for in the folder passed in the `so_folder` parameter.
-- `target` and `target_host` are strings like in TVM (e.g. "llvm --mcpu=avx2"). When using accelerators, target may be something like `cuda` while target_host may be `llvm -mtriple=x86_64-linux-gnu`
-- `opt_level` is the TVM optimization level. It is 3 by default.
-- `freeze_weights` means that all model weights are kept at the compilation stage; otherwise they are downloaded on each inference. True is the recommended value for the best performance. It is true by default.
-- `to_nhwc` switches on special model transformations, particularly of the data layout, matching those used by Octomizer. It allows tuning logs obtained from Octomizer to be used correctly. It is false by default.
-- `tuning_type` defines the type of TVM tuning logs being used, and can be set to either `AutoTVM` (1st gen auto tuning logs) or `Ansor` (2nd gen auto tuning logs). By default this option is set to `AutoTVM`.
-- `tuning_file_path` is path to AutoTVM or Ansor tuning file which gives specifications for given model and target for the best performance. (See below for more details).
-
-TVM supports models with fixed graph only. If your model has unknown dimensions in input shapes (excluding batch size) you must provide the shape using the `input_names` and `input_shapes` provider options. Below is an example of what must be passed to `provider_options`:
-```python
-input_names = "input_1 input_2"
-input_shapes = "[1 3 224 224] [1 2]"
-```
-
-## Performance Tuning
-TVM optimizes machine learning models through an automated tuning process that produces model variants specific to targeted hardware architectures. This process also generates 'tuning logs' that the TVM EP relies on to maximize model performance. These logs can be acquired for your model by either using TVM as described here:
-
-AutoTVM:
-https://tvm.apache.org/docs/how_to/tune_with_autotvm/index.html
-
-Ansor (Autoscheduling):
-https://tvm.apache.org/docs/how_to/tune_with_autoscheduler/index.html
-
-or by using logs generated through the OctoML platform (https://onnx.octoml.ai) using instructions [here](https://help.octoml.ai/en/articles/5814452-using-octoml-platform-logs-with-onnx-rt-tvm-ep)
-
-Using the TVM EP with TVM tuning logs also requires users to turn off ONNX Runtime preprocessing. To do this, the following `SessionOptions()` can be used:
-```
-so = onnxruntime.SessionOptions()
-so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
-
-tvm_session = onnxruntime.InferenceSession(model_path, sess_options=so, providers=["TvmExecutionProvider"], provider_options=po)
-```
-
-### **Using precompiled model**
-It is also possible to use a precompiled model.
-
-The compiled model can be obtained using the [OctoML platform](https://onnx.octoml.ai)
-or compiled directly (see **Support precompiled model** section in
-[Sample notebook for ResNet50 inference with TVM EP](https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb)
-for more information on model compilation).
-
-In order to use the precompiled model, you only need to pass two options:
-* **executor** - `vm` (`VirtualMachine`) must be used as a value
-(this functionality is not supported for `GraphExecutor`);
-* **so_folder** - as a value, you must pass the path to the directory where
-the files of the precompiled model are located.
-* **check_hash** - (optional) if you want to check hash, you must pass `True` as the value.
-* **hash_file_path** - (optional) by default, the file containing the hash for the tuned model will be searched for in the directory passed in the `so_folder` parameter.
- If you want to specify a different location, you must pass the path to the file that contains the desired hash as the value.
-
-You can read more about these options in section [Configuration options](#configuration-options) above.
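As referenced above, a minimal sketch of what these options might look like together (the folder and file paths are placeholders):
```python
import onnxruntime

po = [dict(
    executor="vm",                        # required: precompiled models need the VirtualMachine executor
    so_folder="/path/to/precompiled",     # required: directory containing the precompiled model files (placeholder)
    check_hash=True,                      # optional: verify the hash of the tuned model
    hash_file_path="/path/to/hash.file",  # optional: explicit location of the hash file (placeholder)
)]

tvm_session = onnxruntime.InferenceSession(
    "model.onnx",  # placeholder path
    providers=["TvmExecutionProvider"],
    provider_options=po,
)
```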
-
-
-## Samples
-- [Sample notebook for ResNet50 inference with TVM EP](https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb)
-
-## Known issues
-- At the moment, the TVM EP has only been verified on UNIX/Linux and Windows systems.
-- Some compatibility issues have been found between ONNX and Google protobuf. `AttributeError: module 'google.protobuf.internal.containers' has no attribute 'MutableMapping'`. This usually occurs during `import onnx` in any Python script for protobuf version >= 3.19.0 and ONNX version <= 1.8.1. To resolve the issue, Google protobuf and ONNX can be reinstalled separately or together using:
-```
-pip3 uninstall onnx -y
-pip3 install onnx==1.10.1
-pip3 uninstall protobuf -y
-pip3 install protobuf==3.19.1
-```
-
-The following pairs of protobuf and ONNX versions have been found to be compatible:
-- 3.17.3 and 1.8.0
-- 3.19.1 and 1.10.1
diff --git a/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb b/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb
deleted file mode 100644
index 830495bdfb98d..0000000000000
--- a/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb
+++ /dev/null
@@ -1,657 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "72476497",
- "metadata": {},
- "source": [
- "# ONNX Runtime: Tutorial for TVM execution provider\n",
- "\n",
- "This notebook shows a simple example for model inference with TVM EP.\n",
- "\n",
- "\n",
- "#### Tutorial Roadmap:\n",
- "1. Prerequistes\n",
- "2. Accuracy check for TVM EP\n",
- "3. Configuration options\n",
- "4. Support precompiled model"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9345cbab",
- "metadata": {},
- "source": [
- "## 1. Prerequistes\n",
- "\n",
- "Make sure that you have installed all the necessary dependencies described in the corresponding paragraph of the documentation.\n",
- "\n",
- "Also, make sure you have the `tvm` and `onnxruntime-tvm` packages in your pip environment. \n",
- "\n",
- "If you are using `PYTHONPATH` variable expansion, make sure it contains the following paths: `/onnxruntime/cmake/external/tvm_update/python` and `/onnxruntime/build/Linux/Release`."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "da4ca21f",
- "metadata": {},
- "source": [
- "### Common import\n",
- "\n",
- "These packages can be delivered from standard `pip`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "0f072875",
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import onnx\n",
- "import tempfile\n",
- "import numpy as np\n",
- "from typing import List, AnyStr\n",
- "from onnx import ModelProto, helper, checker, mapping"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "118670aa",
- "metadata": {},
- "source": [
- "### Specialized import\n",
- "\n",
- "It is better to collect these packages from source code in order to clearly understand what is available to you right now."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "a5502966",
- "metadata": {},
- "outputs": [],
- "source": [
- "import onnxruntime\n",
- "\n",
- "import tvm\n",
- "import tvm.relay\n",
- "import tvm.testing\n",
- "import tvm.runtime\n",
- "import tvm.runtime.vm\n",
- "import tvm.relay.backend.vm\n",
- "import tvm.contrib.download"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b7313183",
- "metadata": {},
- "source": [
- "### Helper functions for working with ONNX ModelProto\n",
- "\n",
- "This set of helper functions allows you to recognize the meta information of the models. This information is needed for more versatile processing of ONNX models."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "7d0a36e8",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_onnx_input_names(model: ModelProto) -> List[AnyStr]:\n",
- " inputs = [node.name for node in model.graph.input]\n",
- " initializer = [node.name for node in model.graph.initializer]\n",
- " inputs = list(set(inputs) - set(initializer))\n",
- " return sorted(inputs)\n",
- "\n",
- "\n",
- "def get_onnx_output_names(model: ModelProto) -> List[AnyStr]:\n",
- " return [node.name for node in model.graph.output]\n",
- "\n",
- "\n",
- "def get_onnx_input_types(model: ModelProto) -> List[np.dtype]:\n",
- " input_names = get_onnx_input_names(model)\n",
- " return [\n",
- " mapping.TENSOR_TYPE_TO_NP_TYPE[node.type.tensor_type.elem_type]\n",
- " for node in sorted(model.graph.input, key=lambda node: node.name) if node.name in input_names\n",
- " ]\n",
- "\n",
- "\n",
- "def get_onnx_input_shapes(model: ModelProto) -> List[List[int]]:\n",
- " input_names = get_onnx_input_names(model)\n",
- " return [\n",
- " [dv.dim_value for dv in node.type.tensor_type.shape.dim]\n",
- " for node in sorted(model.graph.input, key=lambda node: node.name) if node.name in input_names\n",
- " ]\n",
- "\n",
- "\n",
- "def get_random_model_inputs(model: ModelProto) -> List[np.ndarray]:\n",
- " input_shapes = get_onnx_input_shapes(model)\n",
- " input_types = get_onnx_input_types(model)\n",
- " assert len(input_types) == len(input_shapes)\n",
- " inputs = [np.random.uniform(size=shape).astype(dtype) for shape, dtype in zip(input_shapes, input_types)]\n",
- " return inputs"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f0de1682",
- "metadata": {},
- "source": [
- "### Wrapper helper functions for Inference\n",
- "\n",
- "Wrapper helper functions for running model inference using ONNX Runtime EP."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "258ce9e9",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_onnxruntime_output(model: ModelProto, inputs: List, provider_name: AnyStr) -> np.ndarray:\n",
- " output_names = get_onnx_output_names(model)\n",
- " input_names = get_onnx_input_names(model)\n",
- " assert len(input_names) == len(inputs)\n",
- " input_dict = {input_name: input_value for input_name, input_value in zip(input_names, inputs)}\n",
- "\n",
- " inference_session = onnxruntime.InferenceSession(model.SerializeToString(), providers=[provider_name])\n",
- " output = inference_session.run(output_names, input_dict)\n",
- "\n",
- " # Unpack output if there's only a single value.\n",
- " if len(output) == 1:\n",
- " output = output[0]\n",
- " return output\n",
- "\n",
- "\n",
- "def get_cpu_onnxruntime_output(model: ModelProto, inputs: List) -> np.ndarray:\n",
- " return get_onnxruntime_output(model, inputs, \"CPUExecutionProvider\")\n",
- "\n",
- "\n",
- "def get_tvm_onnxruntime_output(model: ModelProto, inputs: List) -> np.ndarray:\n",
- " return get_onnxruntime_output(model, inputs, \"TvmExecutionProvider\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "cc17d3b2",
- "metadata": {},
- "source": [
- "### Helper function for checking accuracy\n",
- "\n",
- "This function uses the TVM API to compare two output tensors. The tensor obtained using the `CPUExecutionProvider` is used as a reference.\n",
- "\n",
- "If a mismatch is found between tensors, an appropriate exception will be thrown."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "4e598907",
- "metadata": {},
- "outputs": [],
- "source": [
- "def verify_outputs(\n",
- " lhs: List[np.ndarray],\n",
- " rhs: List[np.ndarray],\n",
- " rtol: float = 5e-5,\n",
- " atol: float = 5e-5\n",
- ") -> None:\n",
- " for lhs_tensor, rhs_tensor in zip(lhs, rhs):\n",
- " tvm.testing.assert_allclose(lhs_tensor, rhs_tensor, rtol=rtol, atol=atol)\n",
- " assert lhs_tensor.dtype == rhs_tensor.dtype\n",
- " print(\"Same output, congratulations!\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "f33a372b",
- "metadata": {},
- "outputs": [],
- "source": [
- "def verify_with_ort_with_inputs(\n",
- " model,\n",
- " inputs,\n",
- " out_shape=None,\n",
- " opset=None,\n",
- " freeze_params=False,\n",
- " dtype=\"float32\",\n",
- " rtol=1e-5,\n",
- " atol=1e-5,\n",
- " opt_level=1,\n",
- "):\n",
- " if opset is not None:\n",
- " model.opset_import[0].version = opset\n",
- "\n",
- " ort_out = get_cpu_onnxruntime_output(model, inputs)\n",
- " tvm_out = get_tvm_onnxruntime_output(model, inputs)\n",
- " verify_outputs(ort_out, tvm_out, rtol, atol)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8c62b01a",
- "metadata": {},
- "source": [
- "### Helper functions for download models\n",
- "\n",
- "These functions use the TVM API to download models from the ONNX Model Zoo."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "324c00e7",
- "metadata": {},
- "outputs": [],
- "source": [
- "BASE_MODEL_URL = \"https://github.com/onnx/models/raw/master/\"\n",
- "MODEL_URL_COLLECTION = {\n",
- " \"ResNet50-v1\": \"vision/classification/resnet/model/resnet50-v1-7.onnx\",\n",
- " \"ResNet50-v2\": \"vision/classification/resnet/model/resnet50-v2-7.onnx\",\n",
- " \"SqueezeNet-v1.1\": \"vision/classification/squeezenet/model/squeezenet1.1-7.onnx\",\n",
- " \"SqueezeNet-v1.0\": \"vision/classification/squeezenet/model/squeezenet1.0-7.onnx\",\n",
- " \"Inception-v1\": \"vision/classification/inception_and_googlenet/inception_v1/model/inception-v1-7.onnx\",\n",
- " \"Inception-v2\": \"vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-7.onnx\",\n",
- "}\n",
- "\n",
- "\n",
- "def get_model_url(model_name):\n",
- " return BASE_MODEL_URL + MODEL_URL_COLLECTION[model_name]\n",
- "\n",
- "\n",
- "def get_name_from_url(url):\n",
- " return url[url.rfind(\"/\") + 1 :].strip()\n",
- "\n",
- "\n",
- "def find_of_download(model_name):\n",
- " model_url = get_model_url(model_name)\n",
- " model_file_name = get_name_from_url(model_url)\n",
- " return tvm.contrib.download.download_testdata(model_url, model_file_name, module=\"models\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "90fb7c5c",
- "metadata": {},
- "source": [
- "## 2. Accuracy check for TVM EP \n",
- "\n",
- "This section will check the accuracy. The check will be to compare the output tensors for `CPUExecutionProvider` and `TvmExecutionProvider`. See the description of `verify_with_ort_with_inputs` function used above.\n",
- "\n",
- "\n",
- "### Check for simple architectures"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "c739ed5c",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_two_input_model(op_name: AnyStr) -> ModelProto:\n",
- " dtype = \"float32\"\n",
- " in_shape = [1, 2, 3, 3]\n",
- " in_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)]\n",
- " out_shape = in_shape\n",
- " out_type = in_type\n",
- "\n",
- " layer = helper.make_node(op_name, [\"in1\", \"in2\"], [\"out\"])\n",
- " graph = helper.make_graph(\n",
- " [layer],\n",
- " \"two_input_test\",\n",
- " inputs=[\n",
- " helper.make_tensor_value_info(\"in1\", in_type, in_shape),\n",
- " helper.make_tensor_value_info(\"in2\", in_type, in_shape),\n",
- " ],\n",
- " outputs=[\n",
- " helper.make_tensor_value_info(\n",
- " \"out\", out_type, out_shape\n",
- " )\n",
- " ],\n",
- " )\n",
- " model = helper.make_model(graph, producer_name=\"two_input_test\")\n",
- " checker.check_model(model, full_check=True)\n",
- " return model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "7048ee6d",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Same output, congratulations!\n",
- "****************** Success! ******************\n"
- ]
- }
- ],
- "source": [
- "onnx_model = get_two_input_model(\"Add\")\n",
- "inputs = get_random_model_inputs(onnx_model)\n",
- "verify_with_ort_with_inputs(onnx_model, inputs)\n",
- "print(\"****************** Success! ******************\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "52c880f4",
- "metadata": {},
- "source": [
- "### Check for DNN architectures "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "f5d465dc",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_onnx_model(model_name):\n",
- " model_path = find_of_download(model_name)\n",
- " onnx_model = onnx.load(model_path)\n",
- " return onnx_model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "68daac7e",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Same output, congratulations!\n",
- "****************** Success! ******************\n"
- ]
- }
- ],
- "source": [
- "model_name = \"ResNet50-v1\"\n",
- "\n",
- "onnx_model = get_onnx_model(model_name)\n",
- "inputs = get_random_model_inputs(onnx_model)\n",
- "verify_with_ort_with_inputs(onnx_model, inputs)\n",
- "print(\"****************** Success! ******************\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e27f64a2",
- "metadata": {},
- "source": [
- "## 3. Configuration options\n",
- "\n",
- "This section shows how you can configure TVM EP using custom options. For more details on the options used, see the corresponding section of the documentation."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "a053f59f",
- "metadata": {},
- "outputs": [],
- "source": [
- "provider_name = \"TvmExecutionProvider\"\n",
- "provider_options = dict(\n",
- " target=\"llvm -mtriple=x86_64-linux-gnu\",\n",
- " target_host=\"llvm -mtriple=x86_64-linux-gnu\",\n",
- " opt_level=3,\n",
- " freeze_weights=True,\n",
- " tuning_file_path=\"\",\n",
- " tuning_type=\"Ansor\",\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "3f6e6f01",
- "metadata": {},
- "outputs": [],
- "source": [
- "model_name = \"ResNet50-v1\"\n",
- "onnx_model = get_onnx_model(model_name)\n",
- "input_dict = {\n",
- " input_name: input_value for input_name, input_value in zip(\n",
- " get_onnx_input_names(onnx_model),\n",
- " get_random_model_inputs(onnx_model),\n",
- " )\n",
- "}\n",
- "output_names = get_onnx_output_names(onnx_model)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "85ab83f2",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "****************** Output shape: (1, 1000) ******************\n"
- ]
- }
- ],
- "source": [
- "tvm_session = onnxruntime.InferenceSession(\n",
- " onnx_model.SerializeToString(),\n",
- " providers=[provider_name],\n",
- " provider_options=[provider_options],\n",
- ")\n",
- "output = tvm_session.run(output_names, input_dict)[0]\n",
- "print(f\"****************** Output shape: {output.shape} ******************\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b704374b",
- "metadata": {},
- "source": [
- "## 4. Support precompiled model\n",
- "\n",
- "Wrapper functions that allow you to compile the model and save it in the desired format."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "8150942b",
- "metadata": {},
- "outputs": [],
- "source": [
- "def compile_virtual_machine(model: onnx.ModelProto, target_str: AnyStr) -> tvm.runtime.vm.Executable:\n",
- " ir_mod, params = tvm.relay.frontend.from_onnx(\n",
- " model,\n",
- " opset=model.opset_import[0].version,\n",
- " freeze_params=True,\n",
- " )\n",
- " target = tvm.target.Target(target=target_str, host=target_str)\n",
- " return tvm.relay.backend.vm.compile(ir_mod, target)\n",
- "\n",
- "\n",
- "def serialize_virtual_machine(vm_exec: tvm.runtime.vm.Executable) -> AnyStr:\n",
- " temp_directory = tempfile.mkdtemp()\n",
- " path_consts = os.path.join(temp_directory, \"consts\")\n",
- " vm_exec.move_late_bound_consts(path_consts, byte_limit=256)\n",
- " lib_path = os.path.join(temp_directory, f\"model.so\")\n",
- " code_path = os.path.join(temp_directory, f\"model.ro\")\n",
- " code, lib = vm_exec.save()\n",
- " lib.export_library(lib_path)\n",
- " with open(code_path, \"wb\") as fo:\n",
- " fo.write(code)\n",
- " return temp_directory"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9cbb987e",
- "metadata": {},
- "source": [
- "Preparation of the ONNX model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "febb9d72",
- "metadata": {},
- "outputs": [],
- "source": [
- "model_name = \"ResNet50-v1\"\n",
- "onnx_model = get_onnx_model(model_name)\n",
- "input_dict = {\n",
- " input_name: input_value for input_name, input_value in zip(\n",
- " get_onnx_input_names(onnx_model),\n",
- " get_random_model_inputs(onnx_model),\n",
- " )\n",
- "}\n",
- "output_names = get_onnx_output_names(onnx_model)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b05b251a",
- "metadata": {},
- "source": [
- "Compiling the ONNX model using `VirtualMachine` (TVM)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "b4b999ee",
- "metadata": {},
- "outputs": [],
- "source": [
- "compiled_vm_exec = compile_virtual_machine(onnx_model, target_str=\"llvm\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "e3408c15",
- "metadata": {},
- "outputs": [],
- "source": [
- "so_folder = serialize_virtual_machine(compiled_vm_exec)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "311405e8",
- "metadata": {},
- "source": [
- "Preparing `ProviderOptions` and launching `TVM EP` inference.\n",
- "\n",
- "In order to use the precompiled model, you only need to pass two options:\n",
- "* **executor** - `vm` (`VirtualMachine`) must be used as a value (this functionality is not supported for `GraphExecutor`);\n",
- "* **so_folder** - as a value, you must pass the path to the directory where the files of the precompiled model are located."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "8927293c",
- "metadata": {},
- "outputs": [],
- "source": [
- "provider_name = \"TvmExecutionProvider\"\n",
- "provider_options = dict(\n",
- " executor=\"vm\",\n",
- " so_folder=so_folder,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "d7532863",
- "metadata": {},
- "outputs": [],
- "source": [
- "tvm_session = onnxruntime.InferenceSession(\n",
- " onnx_model.SerializeToString(),\n",
- " providers=[provider_name],\n",
- " provider_options=[provider_options],\n",
- ")\n",
- "tvm_output = tvm_session.run(output_names, input_dict)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1c0b983e",
- "metadata": {},
- "source": [
- "Let's make sure that the output values match those that can be obtained through `CPUExecutionProvider`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "c3de2299",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Same output, congratulations!\n"
- ]
- }
- ],
- "source": [
- "verify_outputs(\n",
- " tvm_output[0],\n",
- " get_cpu_onnxruntime_output(\n",
- " onnx_model,\n",
- " input_dict.values()\n",
- " ),\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/include/onnxruntime/core/framework/kernel_registry.h b/include/onnxruntime/core/framework/kernel_registry.h
index 7b3d04ee66d9e..aaf533135429c 100644
--- a/include/onnxruntime/core/framework/kernel_registry.h
+++ b/include/onnxruntime/core/framework/kernel_registry.h
@@ -8,6 +8,9 @@
#include "core/framework/op_kernel.h"
namespace onnxruntime {
+namespace logging {
+class Logger;
+}
using KernelCreateMap = std::multimap<std::string, KernelCreateInfo>;
using KernelDefHashes = std::vector<std::pair<std::string, HashValue>>;
@@ -33,6 +36,7 @@ class KernelRegistry {
// Kernel matching uses the types from the node and the kernel_type_str_resolver.
Status TryFindKernel(const Node& node, ProviderType exec_provider,
const IKernelTypeStrResolver& kernel_type_str_resolver,
+ const logging::Logger& logger,
const KernelCreateInfo** out) const;
// map of type constraint name to required type
@@ -42,6 +46,7 @@ class KernelRegistry {
// Kernel matching uses the explicit type constraint name to required type map in type_constraints.
Status TryFindKernel(const Node& node, ProviderType exec_provider,
const TypeConstraintMap& type_constraints,
+ const logging::Logger& logger,
const KernelCreateInfo** out) const;
/**
@@ -61,13 +66,15 @@ class KernelRegistry {
std::string_view domain,
int version,
const KernelRegistry::TypeConstraintMap& type_constraints,
+ const logging::Logger& logger,
const KernelCreateInfo** out) const;
static bool HasImplementationOf(const KernelRegistry& r, const Node& node,
ProviderType exec_provider,
- const IKernelTypeStrResolver& kernel_type_str_resolver) {
+ const IKernelTypeStrResolver& kernel_type_str_resolver,
+ const logging::Logger& logger) {
const KernelCreateInfo* info;
- Status st = r.TryFindKernel(node, exec_provider, kernel_type_str_resolver, &info);
+ Status st = r.TryFindKernel(node, exec_provider, kernel_type_str_resolver, logger, &info);
return st.IsOK();
}
@@ -83,6 +90,7 @@ class KernelRegistry {
Status TryFindKernelImpl(const Node& node, ProviderType exec_provider,
const IKernelTypeStrResolver* kernel_type_str_resolver,
const TypeConstraintMap* type_constraints,
+ const logging::Logger& logger,
const KernelCreateInfo** out) const;
// Check whether the types of inputs/outputs of the given node match the extra
diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h
index a17da2a19bb99..07625c38d8474 100644
--- a/include/onnxruntime/core/framework/op_kernel.h
+++ b/include/onnxruntime/core/framework/op_kernel.h
@@ -79,7 +79,6 @@ class OpKernel {
// the allocator tied to the session if the kernel owns the pre-packed buffer or an
// allocator shared between sessions if the pre-packed buffer is to be shared across sessions
// (i.e.) the kernel does not own the buffer.
- // @param save_prepacked_initializers: Set it to true if intend to save prepacked initializers to external data file.
// @param is_packed: Set it to true if the kernel packed the tensor or to false
// The kernel is responsible for keeping the packed data and related metadata if is_packed is true,
// and the original initialized constant tensor will be released and not accessible anymore in
@@ -89,7 +88,6 @@ class OpKernel {
virtual Status
PrePack(const Tensor& /*tensor*/, int /*input_idx*/, AllocatorPtr /*alloc*/,
- bool, /*save_prepacked_initializers*/
/*out*/ bool& is_packed, /*out*/ PrePackedWeights* /*prepacked_weights*/) {
is_packed = false;
return Status::OK();
@@ -131,26 +129,6 @@ class OpKernel {
return Status::OK();
}
- // Override this function to get pre-packed tensors from this kernel.
- // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
- // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
- // @param input_idx : The index of input we prepacked before and intend to get packed tensor back.
- // Please refer to matmul_nbits kernel for a complete example.
- virtual std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) {
- return std::nullopt;
- }
-
- // Override this function to set pre-packed tensors to this kernel and restore prepacked weight buffer.
- // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
- // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
- // Please refer to matmul_nbits kernel for a complete example.
- // @param input_idx : The input index of the tensor in this kernel.
- // @param pre_packed_tensor: The prepacked tensor read from onnx data file and use the prepacked tensor
- // to restore prepacked weight buffer.
- virtual Status SetPrePackTensor(int /*input_idx*/, const Tensor& /*pre_packed_tensor*/) {
- return Status::OK();
- }
-
const OrtDevice GetDevice(OrtMemType mem_type) const;
const OpKernelInfo& Info() const {
return *op_kernel_info_;
diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
index 69af3c93d7a07..eb9581e8018d1 100644
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@@ -1148,11 +1148,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
void FinalizeFuseSubGraph(const IndexedSubGraph& sub_graph, Node& fused_node);
#endif
- // Since one constant initializer could be used by different kernels
- // and prepacked differently, use an unordered_map to store prepacked
- // initializer in format of <[initializer_name], <[node_name], [prepacked_initializer]>>
- typedef std::unordered_map> PrePackedTensorProtoToSave;
-
#if !defined(ORT_MINIMAL_BUILD)
/** Gets the GraphProto representation of this Graph. */
const ONNX_NAMESPACE::GraphProto& ToGraphProto();
@@ -1187,26 +1182,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
@param initializer_size_threshold initializers larger or equal to this threshold (in bytes) are saved
in the external file. Initializer smaller than this threshold are included in the onnx file.
@param align_info offset alignment info.
- @param save_prepacked_constant_initializers whether to save prepacked initializer into external data file.
- If set false to this boolean, prepacked initializer will not be saved into onnxruntime data file,
- we keep constant initializer as it is.
- @param pre_packed_initializers struct used to store all the prepacked initializers.
@returns GraphProto serialization of the graph.
*/
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold,
- const OffsetAlignmentInfo& align_info,
- bool save_prepacked_constant_initializers,
- PrePackedTensorProtoToSave& pre_packed_initializers) const;
+ const OffsetAlignmentInfo& align_info) const;
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold) const {
OffsetAlignmentInfo default_options;
- PrePackedTensorProtoToSave pre_packed_initializers;
- return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options,
- false, pre_packed_initializers);
+ return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
}
/** Gets the ISchemaRegistry instances being used with this Graph. */
@@ -1521,18 +1508,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
private:
void InitializeStateFromModelFileGraphProto();
- // Private method used to setup external initializer properly during model save,
- // this external initializer could be the original initializer or a prepacked initializer.
- static void SetUpExternalInitializer(const Graph::OffsetAlignmentInfo& align_info,
- size_t tensor_bytes_size,
- int64_t& external_offset,
- std::ofstream& external_stream,
- gsl::span raw_data,
- ONNX_NAMESPACE::TensorProto& output_proto,
- const std::filesystem::path& external_file_path,
- const ONNX_NAMESPACE::TensorProto& initializer,
- bool is_prepacked);
-
// Add node with specified .
Node& AddNode(const ONNX_NAMESPACE::NodeProto& node_proto,
const ArgNameToTypeMap& name_to_type);
diff --git a/include/onnxruntime/core/optimizer/graph_transformer_utils.h b/include/onnxruntime/core/optimizer/graph_transformer_utils.h
index 6cff153c336f0..31b0f22340510 100644
--- a/include/onnxruntime/core/optimizer/graph_transformer_utils.h
+++ b/include/onnxruntime/core/optimizer/graph_transformer_utils.h
@@ -53,6 +53,7 @@ InlinedVector> GenerateTransformers(
TransformerLevel level,
const SessionOptions& session_options,
const IExecutionProvider& execution_provider /*required by constant folding*/,
+ const logging::Logger& logger,
const InlinedHashSet& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map>* p_buffered_tensors = nullptr);
@@ -84,6 +85,7 @@ InlinedVector> GenerateTransformersForMinimalB
const SessionOptions& session_options,
const SatApplyContextVariant& apply_context,
const IExecutionProvider& cpu_execution_provider,
+ const logging::Logger& logger,
const InlinedHashSet& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map>* p_buffered_tensors = nullptr);
diff --git a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
index 98fa9e09f1ba8..d035fd34bd072 100644
--- a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
+++ b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
@@ -41,6 +41,27 @@ enum COREMLFlags {
COREML_FLAG_LAST = COREML_FLAG_USE_CPU_AND_GPU,
};
+// MLComputeUnits can be one of the following values:
+// 'MLComputeUnitsCPUAndNeuralEngine|MLComputeUnitsCPUAndGPU|MLComputeUnitsCPUOnly|MLComputeUnitsAll'
+// these values are intended to be used with Ort::SessionOptions::AppendExecutionProvider (C++ API)
+// and SessionOptionsAppendExecutionProvider (C API). For the old API, use COREMLFlags instead.
+static const char* const kCoremlProviderOption_MLComputeUnits = "MLComputeUnits";
+static const char* const kCoremlProviderOption_ModelFormat = "ModelFormat";
+// same as COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES
+static const char* const kCoremlProviderOption_RequireStaticInputShapes = "RequireStaticInputShapes";
+static const char* const kCoremlProviderOption_EnableOnSubgraphs = "EnableOnSubgraphs";
+// provided by https://developer.apple.com/documentation/coreml/mloptimizationhints-swift.struct/specializationstrategy-swift.property
+// Core ML segments the model’s compute graph and specializes each segment for the target compute device.
+// This process can affect the model loading time and the prediction latency.
+// Use this option to tailor the specialization strategy for your model.
+static const char* const kCoremlProviderOption_SpecializationStrategy = "SpecializationStrategy";
+// Profile the Core ML MLComputePlan.
+// This logs the hardware each operator is dispatched to and the estimated execution time.
+// Intended for developer usage but provides useful diagnostic information if performance is not as expected.
+static const char* const kCoremlProviderOption_ProfileComputePlan = "ProfileComputePlan";
+// please refer to https://developer.apple.com/documentation/coreml/mlmodelconfiguration/allowlowprecisionaccumulationongpu
+static const char* const kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU = "AllowLowPrecisionAccumulationOnGPU";
+
#ifdef __cplusplus
extern "C" {
#endif
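The CoreML provider option keys added above are plain string key/value pairs. As a rough illustration only (assuming they are routed through the generic provider-options mechanism the comments mention, and using the `MLComputeUnits` value spelled out in the comment; the model path is a placeholder), they might be passed from Python like this:
```python
import onnxruntime

coreml_options = {
    "MLComputeUnits": "MLComputeUnitsAll",  # one of the values listed in the comment above
    "ProfileComputePlan": "1",              # assumed "0"/"1" string flag for compute-plan profiling
    "RequireStaticInputShapes": "0",        # assumed "0"/"1" string flag, mirroring the CoreML flag
}

session = onnxruntime.InferenceSession(
    "model.onnx",  # placeholder path
    providers=["CoreMLExecutionProvider"],
    provider_options=[coreml_options],
)
```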
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index b0c5d2329c428..a35d975ac8f1b 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -626,8 +626,13 @@ typedef struct OrtMIGraphXProviderOptions {
} OrtMIGraphXProviderOptions;
/** \brief OpenVINO Provider Options
- *
- * \see OrtApi::SessionOptionsAppendExecutionProvider_OpenVINO
+ * \brief This struct is frozen since ORT 1.13.0. It is maintained as part of the legacy API for compatibility.
+ * \brief For the latest OpenVINO provider options, switch to the ProviderOptions map.
+ * \brief Latest OpenVINO Provider Options are listed in the
+ * \htmlonly
+ * onnxruntime document.
+ * \endhtmlonly
+ * \see OrtApi::SessionOptionsAppendExecutionProvider()
*/
typedef struct OrtOpenVINOProviderOptions {
#ifdef __cplusplus
@@ -645,7 +650,7 @@ typedef struct OrtOpenVINOProviderOptions {
* Valid settings are one of: "CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16"
*/
const char* device_type;
- unsigned char enable_npu_fast_compile;
+ unsigned char enable_npu_fast_compile; ///< 0 = disabled, nonzero = enabled
const char* device_id;
size_t num_of_threads; ///< 0 = Use default number of threads
const char* cache_dir; // path is set to empty by default
@@ -3662,6 +3667,9 @@ struct OrtApi {
* execution provider (typically CPU EP).
* - "0": Default. Disabled. QNN EP will handle quantization and dequantization of graph I/O.
* - "1": Enabled.
+ * "enable_htp_spill_fill_buffer": Enable HTP spill fill buffer setting. The flag is used while generating context binary.
+ * - "0": Default. Disabled.
+ * - "1": Enabled.
*
* SNPE supported keys:
* "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16",
@@ -4607,6 +4615,8 @@ struct OrtApi {
* \param[in] num_keys
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.17.
*/
ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_OpenVINO_V2,
_In_ OrtSessionOptions* options,
@@ -4624,6 +4634,8 @@ struct OrtApi {
* \param[in] num_keys
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.18.
*/
ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_VitisAI,
_In_ OrtSessionOptions* options,
@@ -4637,7 +4649,10 @@ struct OrtApi {
* \param[in] mem_info OrtMemoryInfo instance
* \param[in] count_or_bytes How many bytes is this scratch buffer
* \param[out] out A pointer to the scrach buffer
+ *
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.18.
*/
ORT_API2_STATUS(KernelContext_GetScratchBuffer, _In_ const OrtKernelContext* context, _In_ const OrtMemoryInfo* mem_info, _In_ size_t count_or_bytes, _Outptr_ void** out);
@@ -4648,6 +4663,8 @@ struct OrtApi {
* \param[out] out A pointer to OrtAllocator
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.18.
*/
ORT_API2_STATUS(KernelInfoGetAllocator, _In_ const OrtKernelInfo* info, _In_ OrtMemType mem_type, _Outptr_ OrtAllocator** out);
@@ -4669,6 +4686,8 @@ struct OrtApi {
* \param[in] num_external_initializer_files Number of external files
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.18.
*/
ORT_API2_STATUS(AddExternalInitializersFromFilesInMemory, _In_ OrtSessionOptions* options,
_In_reads_(num_external_initializer_files) const ORTCHAR_T* const* external_initializer_file_names,
@@ -4691,6 +4710,8 @@ struct OrtApi {
* OrtApi::ReleaseLoraAdapter.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.20.
*/
ORT_API2_STATUS(CreateLoraAdapter, const ORTCHAR_T* adapter_file_path, _In_ OrtAllocator* allocator,
_Outptr_ OrtLoraAdapter** out);
@@ -4709,6 +4730,8 @@ struct OrtApi {
* OrtApi::ReleaseLoraAdapter.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.20.
*/
ORT_API2_STATUS(CreateLoraAdapterFromArray, _In_ const void* bytes, size_t num_bytes, _In_ OrtAllocator* allocator,
_Outptr_ OrtLoraAdapter** out);
@@ -4730,6 +4753,8 @@ struct OrtApi {
* \param[in] adapter OrtLoraAdapter instance
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.20.
*/
ORT_API2_STATUS(RunOptionsAddActiveLoraAdapter, _Inout_ OrtRunOptions* options, _In_ const OrtLoraAdapter* adapter);
@@ -4748,6 +4773,8 @@ struct OrtApi {
* \param[in] kv_len Number of elements in the keys and values arrays
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.20.
*/
ORT_API2_STATUS(SetEpDynamicOptions, _Inout_ OrtSession* sess, _In_reads_(kv_len) const char* const* keys,
_In_reads_(kv_len) const char* const* values, _In_ size_t kv_len);
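One of the QNN keys documented in this header hunk is `enable_htp_spill_fill_buffer`. As a hedged sketch only (assuming the key is accepted through the generic provider-options path; the model path is a placeholder, and `libQnnHtp.so` is the HTP backend library also used by the Android test later in this diff):
```python
import onnxruntime

qnn_options = {
    "backend_path": "libQnnHtp.so",       # QNN HTP backend library
    "enable_htp_spill_fill_buffer": "1",  # "1" enables the HTP spill-fill buffer while generating a context binary
}

session = onnxruntime.InferenceSession(
    "model.onnx",  # placeholder path
    providers=["QNNExecutionProvider"],
    provider_options=[qnn_options],
)
```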
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 086919913cbea..6a01602e634f8 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -246,12 +246,6 @@ static const char* const kOrtSessionOptionsDisableCPUEPFallback = "session.disab
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
"session.optimized_model_external_initializers_file_name";
-// Use this config when save prepacked constant initializers to onnx external data file.
-// Default is not save prepacked initializers to onnx data file.
-// Sample usage: sess_options.add_session_config_entry('session.save_prepacked_constant_initializers', "1")
-static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
- "session.save_prepacked_constant_initializers";
-
// Use this config to control the minimum size of the initializer when externalizing it during serialization
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";
diff --git a/java/build-android.gradle b/java/build-android.gradle
index d5839f9f27869..9c4275b74f626 100644
--- a/java/build-android.gradle
+++ b/java/build-android.gradle
@@ -82,7 +82,7 @@ allprojects {
}
android {
- compileSdkVersion 32
+ compileSdkVersion 34
defaultConfig {
minSdkVersion minSdkVer
@@ -108,8 +108,8 @@ android {
}
compileOptions {
- sourceCompatibility = JavaVersion.VERSION_1_8
- targetCompatibility = JavaVersion.VERSION_1_8
+ sourceCompatibility = JavaVersion.VERSION_17
+ targetCompatibility = JavaVersion.VERSION_17
}
sourceSets {
diff --git a/java/build.gradle b/java/build.gradle
index 34ac93cce6f4e..845121dd17a48 100644
--- a/java/build.gradle
+++ b/java/build.gradle
@@ -50,8 +50,8 @@ mavenSettings {
}
java {
- sourceCompatibility = JavaVersion.VERSION_1_8
- targetCompatibility = JavaVersion.VERSION_1_8
+ sourceCompatibility = JavaVersion.VERSION_17
+ targetCompatibility = JavaVersion.VERSION_17
}
// This jar tasks serves as a CMAKE signaling
diff --git a/java/gradle/wrapper/gradle-wrapper.properties b/java/gradle/wrapper/gradle-wrapper.properties
index 4baf5a11d45a3..381baa9cef1ec 100644
--- a/java/gradle/wrapper/gradle-wrapper.properties
+++ b/java/gradle/wrapper/gradle-wrapper.properties
@@ -1,7 +1,7 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionSha256Sum=9631d53cf3e74bfa726893aee1f8994fee4e060c401335946dba2156f440f24c
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip
+distributionSha256Sum=544c35d6bd849ae8a5ed0bcea39ba677dc40f49df7d1835561582da2009b961d
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
diff --git a/java/gradlew.bat b/java/gradlew.bat
index 93e3f59f135dd..25da30dbdeee9 100644
--- a/java/gradlew.bat
+++ b/java/gradlew.bat
@@ -43,11 +43,11 @@ set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if %ERRORLEVEL% equ 0 goto execute
-echo.
-echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
+echo. 1>&2
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2
+echo. 1>&2
+echo Please set the JAVA_HOME variable in your environment to match the 1>&2
+echo location of your Java installation. 1>&2
goto fail
@@ -57,11 +57,11 @@ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto execute
-echo.
-echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
+echo. 1>&2
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2
+echo. 1>&2
+echo Please set the JAVA_HOME variable in your environment to match the 1>&2
+echo location of your Java installation. 1>&2
goto fail
diff --git a/java/src/main/java/ai/onnxruntime/OrtSession.java b/java/src/main/java/ai/onnxruntime/OrtSession.java
index 7280f3c88e2e8..32dc9d9f84aaa 100644
--- a/java/src/main/java/ai/onnxruntime/OrtSession.java
+++ b/java/src/main/java/ai/onnxruntime/OrtSession.java
@@ -1323,6 +1323,18 @@ public void addQnn(Map providerOptions) throws OrtException {
addExecutionProvider(qnnProviderName, providerOptions);
}
+ /**
+ * Adds CoreML as an execution backend.
+ *
+ * @param providerOptions Configuration options for the CoreML backend. Refer to the CoreML
+ * execution provider's documentation.
+ * @throws OrtException If there was an error in native code.
+ */
+ public void addCoreML(Map<String, String> providerOptions) throws OrtException {
+ String CoreMLProviderName = "CoreML";
+ addExecutionProvider(CoreMLProviderName, providerOptions);
+ }
+
private native void setExecutionMode(long apiHandle, long nativeHandle, int mode)
throws OrtException;
diff --git a/java/src/test/android/README.md b/java/src/test/android/README.md
index b84021669c9fe..b086be3dc904c 100644
--- a/java/src/test/android/README.md
+++ b/java/src/test/android/README.md
@@ -29,6 +29,11 @@ Use the android's [build instructions](https://onnxruntime.ai/docs/build/android
Please note that you may need to set the `--android_abi=x86_64` (the default option is `arm64-v8a`). This is because android instrumentation test is run on an android emulator which requires an abi of `x86_64`.
+#### QNN Builds
+We use two AndroidManifest.xml files to manage different runtime requirements for QNN support. In the [build configuration](app/build.gradle), we specify which manifest file to use based on the qnnVersion.
+In the [QNN manifest](app/src/main/AndroidManifestQnn.xml), we include the declaration for libcdsprpc.so, which is required for devices using QNN and Qualcomm DSP capabilities.
+For QNN builds, it is also necessary to set the `ADSP_LIBRARY_PATH` environment variable to the [native library directory](https://developer.android.com/reference/android/content/pm/ApplicationInfo#nativeLibraryDir) depending on the device. This ensures that any native libraries downloaded as dependencies, such as the QNN libraries, are found by the application. This is conditionally added by using the BuildConfig field IS_QNN_BUILD set in the build.gradle file.
+
#### Build Output
The build will generate two apks which is required to run the test application in `$YOUR_BUILD_DIR/java/androidtest/android/app/build/outputs/apk`:
diff --git a/java/src/test/android/app/build.gradle b/java/src/test/android/app/build.gradle
index 381de06cc09de..baf18e714d25c 100644
--- a/java/src/test/android/app/build.gradle
+++ b/java/src/test/android/app/build.gradle
@@ -4,18 +4,27 @@ plugins {
}
def minSdkVer = System.properties.get("minSdkVer")?:24
+def qnnVersion = System.properties['qnnVersion']
android {
- compileSdkVersion 32
+ compileSdkVersion 34
defaultConfig {
applicationId "ai.onnxruntime.example.javavalidator"
minSdkVersion minSdkVer
- targetSdkVersion 32
+ targetSdkVersion 34
versionCode 1
versionName "1.0"
testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
+
+ // Add BuildConfig field for qnnVersion
+ if (qnnVersion != null) {
+ buildConfigField "boolean", "IS_QNN_BUILD", "true"
+ }
+ else {
+ buildConfigField "boolean", "IS_QNN_BUILD", "false"
+ }
}
buildTypes {
@@ -25,11 +34,29 @@ android {
}
}
compileOptions {
- sourceCompatibility JavaVersion.VERSION_1_8
- targetCompatibility JavaVersion.VERSION_1_8
+ sourceCompatibility JavaVersion.VERSION_17
+ targetCompatibility JavaVersion.VERSION_17
}
kotlinOptions {
- jvmTarget = '1.8'
+ jvmTarget = '17'
+ }
+ // Conditional packagingOptions for QNN builds only
+ if (qnnVersion != null) {
+ packagingOptions {
+ jniLibs {
+ useLegacyPackaging = true
+ }
+ // Dsp is used in older QC devices and not supported by ORT
+ // Gpu support isn't the target, we just want Npu support (Htp)
+ exclude 'lib/arm64-v8a/libQnnGpu.so'
+ exclude 'lib/arm64-v8a/libQnnDsp*.so'
+ }
+
+ sourceSets {
+ main {
+ manifest.srcFile 'src/main/AndroidManifestQnn.xml' // Use QNN manifest
+ }
+ }
}
namespace 'ai.onnxruntime.example.javavalidator'
}
@@ -42,11 +69,20 @@ dependencies {
implementation 'com.google.android.material:material:1.3.0'
implementation 'androidx.constraintlayout:constraintlayout:2.0.4'
testImplementation 'junit:junit:4.+'
- androidTestImplementation 'androidx.test.ext:junit:1.1.3'
- androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0'
- implementation(name: "onnxruntime-android", ext: "aar")
+ androidTestImplementation "androidx.test.ext:junit:1.1.5"
+ androidTestImplementation "androidx.test.espresso:espresso-core:3.5.0"
- androidTestImplementation 'androidx.test:runner:1.4.0'
- androidTestImplementation 'androidx.test:rules:1.4.0'
+ androidTestImplementation "androidx.test:runner:1.5.2"
+ androidTestImplementation "androidx.test:rules:1.5.0"
androidTestImplementation 'com.microsoft.appcenter:espresso-test-extension:1.4'
+
+ // dependencies for onnxruntime-android-qnn
+ if (qnnVersion != null) {
+ implementation(name: "onnxruntime-android-qnn", ext: "aar")
+ implementation "com.qualcomm.qti:qnn-runtime:$qnnVersion"
+ }
+ else {
+ implementation(name: "onnxruntime-android", ext: "aar")
+ }
+
}
diff --git a/java/src/test/android/app/src/androidTest/java/ai/onnxruntime/example/javavalidator/SimpleTest.kt b/java/src/test/android/app/src/androidTest/java/ai/onnxruntime/example/javavalidator/SimpleTest.kt
index 166803ae263a5..5e6bee6cac9f4 100644
--- a/java/src/test/android/app/src/androidTest/java/ai/onnxruntime/example/javavalidator/SimpleTest.kt
+++ b/java/src/test/android/app/src/androidTest/java/ai/onnxruntime/example/javavalidator/SimpleTest.kt
@@ -38,13 +38,18 @@ class SimpleTest {
@Test
fun runSigmoidModelTest() {
for (intraOpNumThreads in 1..4) {
- runSigmoidModelTestImpl(intraOpNumThreads)
+ runSigmoidModelTestImpl(intraOpNumThreads, OrtProvider.CPU)
}
}
@Test
fun runSigmoidModelTestNNAPI() {
- runSigmoidModelTestImpl(1, true)
+ runSigmoidModelTestImpl(1, OrtProvider.NNAPI)
+ }
+
+ @Test
+ fun runSigmoidModelTestQNN() {
+ runSigmoidModelTestImpl(1, OrtProvider.QNN)
}
@Throws(IOException::class)
@@ -54,22 +59,49 @@ class SimpleTest {
}
@Throws(OrtException::class, IOException::class)
- fun runSigmoidModelTestImpl(intraOpNumThreads: Int, useNNAPI: Boolean = false) {
- reportHelper.label("Start Running Test with intraOpNumThreads=$intraOpNumThreads, useNNAPI=$useNNAPI")
+ fun runSigmoidModelTestImpl(intraOpNumThreads: Int, executionProvider: OrtProvider) {
+ reportHelper.label("Start Running Test with intraOpNumThreads=$intraOpNumThreads, executionProvider=$executionProvider")
Log.println(Log.INFO, TAG, "Testing with intraOpNumThreads=$intraOpNumThreads")
- Log.println(Log.INFO, TAG, "Testing with useNNAPI=$useNNAPI")
+ Log.println(Log.INFO, TAG, "Testing with executionProvider=$executionProvider")
+
val env = OrtEnvironment.getEnvironment(OrtLoggingLevel.ORT_LOGGING_LEVEL_VERBOSE)
env.use {
val opts = SessionOptions()
opts.setIntraOpNumThreads(intraOpNumThreads)
- if (useNNAPI) {
- if (OrtEnvironment.getAvailableProviders().contains(OrtProvider.NNAPI)) {
- opts.addNnapi()
- } else {
- Log.println(Log.INFO, TAG, "NO NNAPI EP available, skip the test")
- return
+
+ when (executionProvider) {
+
+ OrtProvider.NNAPI -> {
+ if (OrtEnvironment.getAvailableProviders().contains(OrtProvider.NNAPI)) {
+ opts.addNnapi()
+ } else {
+ Log.println(Log.INFO, TAG, "NO NNAPI EP available, skip the test")
+ return
+ }
+ }
+
+ OrtProvider.QNN -> {
+ if (OrtEnvironment.getAvailableProviders().contains(OrtProvider.QNN)) {
+ // Since this is running in an Android environment, we use the .so library
+ val qnnLibrary = "libQnnHtp.so"
+ val providerOptions = Collections.singletonMap("backend_path", qnnLibrary)
+ opts.addQnn(providerOptions)
+ } else {
+ Log.println(Log.INFO, TAG, "NO QNN EP available, skip the test")
+ return
+ }
+ }
+
+ OrtProvider.CPU -> {
+ // No additional configuration is needed for CPU
+ }
+
+ else -> {
+ // Non exhaustive when statements on enum will be prohibited in future Gradle versions
+ Log.println(Log.INFO, TAG, "Skipping test as OrtProvider is not implemented")
}
}
+
opts.use {
val session = env.createSession(readModel("sigmoid.ort"), opts)
session.use {
@@ -92,13 +124,15 @@ class SimpleTest {
output.use {
@Suppress("UNCHECKED_CAST")
val rawOutput = output[0].value as Array>
+ // QNN EP will run the Sigmoid float32 op with fp16 precision
+ val precision = if (executionProvider == OrtProvider.QNN) 1e-3 else 1e-6
for (i in 0..2) {
for (j in 0..3) {
for (k in 0..4) {
Assert.assertEquals(
rawOutput[i][j][k],
expected[i][j][k],
- 1e-6.toFloat()
+ precision.toFloat()
)
}
}
diff --git a/java/src/test/android/app/src/main/AndroidManifest.xml b/java/src/test/android/app/src/main/AndroidManifest.xml
index 2938b7e8bf409..08a612ed79fd6 100644
--- a/java/src/test/android/app/src/main/AndroidManifest.xml
+++ b/java/src/test/android/app/src/main/AndroidManifest.xml
@@ -17,4 +17,4 @@
-
\ No newline at end of file
+
diff --git a/java/src/test/android/app/src/main/AndroidManifestQnn.xml b/java/src/test/android/app/src/main/AndroidManifestQnn.xml
new file mode 100644
index 0000000000000..c9416523a9c91
--- /dev/null
+++ b/java/src/test/android/app/src/main/AndroidManifestQnn.xml
@@ -0,0 +1,23 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/java/src/test/android/app/src/main/java/ai/onnxruntime/example/javavalidator/MainActivity.kt b/java/src/test/android/app/src/main/java/ai/onnxruntime/example/javavalidator/MainActivity.kt
index 62e23c4b9b862..3b3a2d057b16e 100644
--- a/java/src/test/android/app/src/main/java/ai/onnxruntime/example/javavalidator/MainActivity.kt
+++ b/java/src/test/android/app/src/main/java/ai/onnxruntime/example/javavalidator/MainActivity.kt
@@ -1,11 +1,19 @@
package ai.onnxruntime.example.javavalidator
import android.os.Bundle
+import android.system.Os
import androidx.appcompat.app.AppCompatActivity
/*Empty activity app mainly used for testing*/
class MainActivity : AppCompatActivity() {
override fun onCreate(savedInstanceState: Bundle?) {
+ if (BuildConfig.IS_QNN_BUILD) {
+ val adspLibraryPath = applicationContext.applicationInfo.nativeLibraryDir
+ // set the path variable to the native library directory
+ // so that any native libraries downloaded as dependencies
+ // (like qnn libs) are found
+ Os.setenv("ADSP_LIBRARY_PATH", adspLibraryPath, true)
+ }
super.onCreate(savedInstanceState)
}
-}
\ No newline at end of file
+}
diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
index 57c4eb3577fd0..fa0b6fd0ef9d9 100644
--- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
+++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
@@ -27,6 +27,7 @@
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
public class ProviderOptionsTest {
@@ -34,6 +35,7 @@ public class ProviderOptionsTest {
@Test
@EnabledIfSystemProperty(named = "USE_CUDA", matches = "1")
+ @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
public void testCUDAOptions() throws OrtException {
// Test standard options
OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0);
@@ -61,6 +63,7 @@ public void testCUDAOptions() throws OrtException {
@Test
@EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1")
+ @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
public void testTensorRT() throws OrtException {
// Test standard options
OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0);
diff --git a/js/.eslintrc.js b/js/.eslintrc.js
index bd1e9061355f5..462e417df1d66 100644
--- a/js/.eslintrc.js
+++ b/js/.eslintrc.js
@@ -198,19 +198,6 @@ module.exports = {
'_OrtReleaseTensor',
'_OrtRun',
'_OrtRunWithBinding',
- '_OrtTrainingCopyParametersFromBuffer',
- '_OrtTrainingCopyParametersToBuffer',
- '_OrtTrainingCreateSession',
- '_OrtTrainingEvalStep',
- '_OrtTrainingGetModelInputOutputCount',
- '_OrtTrainingGetModelInputOutputName',
- '_OrtTrainingGetParametersSize',
- '_OrtTrainingLazyResetGrad',
- '_OrtTrainingLoadCheckpoint',
- '_OrtTrainingOptimizerStep',
- '_OrtTrainingReleaseCheckpoint',
- '_OrtTrainingReleaseSession',
- '_OrtTrainingRunTrainStep',
],
},
],
diff --git a/js/common/lib/backend.ts b/js/common/lib/backend.ts
index e27e67622aa82..e63f9c6c9147f 100644
--- a/js/common/lib/backend.ts
+++ b/js/common/lib/backend.ts
@@ -3,7 +3,6 @@
import { InferenceSession } from './inference-session.js';
import { OnnxValue } from './onnx-value.js';
-import { TrainingSession } from './training-session.js';
/**
* @ignore
@@ -42,33 +41,6 @@ export interface InferenceSessionHandler extends SessionHandler {
): Promise;
}
-/**
- * Represent a handler instance of a training inference session.
- *
- * @ignore
- */
-export interface TrainingSessionHandler extends SessionHandler {
- readonly evalInputNames: readonly string[];
- readonly evalOutputNames: readonly string[];
-
- lazyResetGrad(): Promise;
- runTrainStep(
- feeds: SessionHandler.FeedsType,
- fetches: SessionHandler.FetchesType,
- options: InferenceSession.RunOptions,
- ): Promise;
- runOptimizerStep(options: InferenceSession.RunOptions): Promise;
- runEvalStep(
- feeds: SessionHandler.FeedsType,
- fetches: SessionHandler.FetchesType,
- options: InferenceSession.RunOptions,
- ): Promise;
-
- getParametersSize(trainableOnly: boolean): Promise;
- loadParametersBuffer(buffer: Uint8Array, trainableOnly: boolean): Promise;
- getContiguousParameters(trainableOnly: boolean): Promise;
-}
-
/**
* Represent a backend that provides implementation of model inferencing.
*
@@ -84,14 +56,6 @@ export interface Backend {
uriOrBuffer: string | Uint8Array,
options?: InferenceSession.SessionOptions,
): Promise;
-
- createTrainingSessionHandler?(
- checkpointStateUriOrBuffer: TrainingSession.UriOrBuffer,
- trainModelUriOrBuffer: TrainingSession.UriOrBuffer,
- evalModelUriOrBuffer: TrainingSession.UriOrBuffer,
- optimizerModelUriOrBuffer: TrainingSession.UriOrBuffer,
- options: InferenceSession.SessionOptions,
- ): Promise;
}
export { registerBackend } from './backend-impl.js';
diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts
index 642a897a90d26..e70f608ad7030 100644
--- a/js/common/lib/env.ts
+++ b/js/common/lib/env.ts
@@ -2,6 +2,7 @@
// Licensed under the MIT License.
import { env as envImpl } from './env-impl.js';
+import { TryGetGlobalType } from './type-helper.js';
export declare namespace Env {
export type WasmPathPrefix = string;
@@ -14,7 +15,6 @@ export declare namespace Env {
* If not modified, the filename of the .wasm file is:
* - `ort-wasm-simd-threaded.wasm` for default build
* - `ort-wasm-simd-threaded.jsep.wasm` for JSEP build (with WebGPU and WebNN)
- * - `ort-training-wasm-simd-threaded.wasm` for training build
*/
wasm?: URL | string;
/**
@@ -25,7 +25,6 @@ export declare namespace Env {
* If not modified, the filename of the .mjs file is:
* - `ort-wasm-simd-threaded.mjs` for default build
* - `ort-wasm-simd-threaded.jsep.mjs` for JSEP build (with WebGPU and WebNN)
- * - `ort-training-wasm-simd-threaded.mjs` for training build
*/
mjs?: URL | string;
}
@@ -200,22 +199,16 @@ export declare namespace Env {
* value will be the GPU adapter that created by the underlying WebGPU backend.
*
* When use with TypeScript, the type of this property is `GPUAdapter` defined in "@webgpu/types".
- * Use `const adapter = env.webgpu.adapter as GPUAdapter;` in TypeScript to access this property with correct type.
- *
- * see comments on {@link Tensor.GpuBufferType}
*/
- adapter: unknown;
+ adapter: TryGetGlobalType<'GPUAdapter'>;
/**
* Get the device for WebGPU.
*
* This property is only available after the first WebGPU inference session is created.
*
* When use with TypeScript, the type of this property is `GPUDevice` defined in "@webgpu/types".
- * Use `const device = env.webgpu.device as GPUDevice;` in TypeScript to access this property with correct type.
- *
- * see comments on {@link Tensor.GpuBufferType} for more details about why not use types defined in "@webgpu/types".
*/
- readonly device: unknown;
+ readonly device: TryGetGlobalType<'GPUDevice'>;
/**
* Set or get whether validate input content.
*
diff --git a/js/common/lib/index.ts b/js/common/lib/index.ts
index 3ed56b3c2e812..d75e6a477258d 100644
--- a/js/common/lib/index.ts
+++ b/js/common/lib/index.ts
@@ -26,4 +26,3 @@ export * from './tensor-factory.js';
export * from './trace.js';
export * from './onnx-model.js';
export * from './onnx-value.js';
-export * from './training-session.js';
diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts
index 547db029471a2..e62c6579e8333 100644
--- a/js/common/lib/inference-session.ts
+++ b/js/common/lib/inference-session.ts
@@ -4,6 +4,7 @@
import { InferenceSession as InferenceSessionImpl } from './inference-session-impl.js';
import { OnnxModelOptions } from './onnx-model.js';
import { OnnxValue, OnnxValueDataLocation } from './onnx-value.js';
+import { TryGetGlobalType } from './type-helper.js';
/* eslint-disable @typescript-eslint/no-redeclare */
@@ -282,7 +283,7 @@ export declare namespace InferenceSession {
extends WebNNExecutionProviderName,
Omit<WebNNContextOptions, 'deviceType'>,
Required<Pick<WebNNContextOptions, 'deviceType'>> {
- context: unknown /* MLContext */;
+ context: TryGetGlobalType<'MLContext'>;
}
/**
@@ -291,8 +292,8 @@ export declare namespace InferenceSession {
* @see https://www.w3.org/TR/webnn/#dom-ml-createcontext-gpudevice
*/
export interface WebNNOptionsWebGpu extends WebNNExecutionProviderName {
- context: unknown /* MLContext */;
- gpuDevice: unknown /* GPUDevice */;
+ context: TryGetGlobalType<'MLContext'>;
+ gpuDevice: TryGetGlobalType<'GPUDevice'>;
}
/**
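For context: a hedged sketch of passing a pre-created `MLContext` to the WebNN execution provider, which these typed options describe. Illustrative only, not part of the patch; the `navigator.ml` cast is an assumption for setups where the WebNN IDL is not declared globally, and the browser must actually support WebNN:

```ts
import * as ort from 'onnxruntime-web';

async function createWebNNSession(model: Uint8Array): Promise<ort.InferenceSession> {
  // Create an MLContext up front so it can be shared with other WebNN work.
  // Cast because the WebNN IDL may not be declared globally in every setup.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const context = await (navigator as any).ml.createContext({ deviceType: 'gpu' });

  return ort.InferenceSession.create(model, {
    executionProviders: [{ name: 'webnn', deviceType: 'gpu', context }],
  });
}
```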
diff --git a/js/common/lib/tensor.ts b/js/common/lib/tensor.ts
index af918705b97e3..05553bd96662b 100644
--- a/js/common/lib/tensor.ts
+++ b/js/common/lib/tensor.ts
@@ -4,6 +4,7 @@
import { TensorFactory } from './tensor-factory.js';
import { Tensor as TensorImpl } from './tensor-impl.js';
import { TypedTensorUtils } from './tensor-utils.js';
+import { TryGetGlobalType } from './type-helper.js';
/* eslint-disable @typescript-eslint/no-redeclare */
@@ -131,24 +132,19 @@ export declare namespace Tensor {
*/
export type TextureDataTypes = 'float32';
+ type GpuBufferTypeFallback = { size: number; mapState: 'unmapped' | 'pending' | 'mapped' };
/**
* type alias for WebGPU buffer
- *
- * The reason why we don't use type "GPUBuffer" defined in webgpu.d.ts from @webgpu/types is because "@webgpu/types"
- * requires "@types/dom-webcodecs" as peer dependency when using TypeScript < v5.1 and its version need to be chosen
- * carefully according to the TypeScript version being used. This means so far there is not a way to keep every
- * TypeScript version happy. It turns out that we will easily broke users on some TypeScript version.
- *
- * for more info see https://github.com/gpuweb/types/issues/127
*/
- export type GpuBufferType = { size: number; mapState: 'unmapped' | 'pending' | 'mapped' };
+ export type GpuBufferType = TryGetGlobalType<'GPUBuffer', GpuBufferTypeFallback>;
+ type MLTensorTypeFallback = { destroy(): void };
/**
* type alias for WebNN MLTensor
*
* The specification for WebNN's MLTensor is currently in flux.
*/
- export type MLTensorType = unknown;
+ export type MLTensorType = TryGetGlobalType<'MLTensor', MLTensorTypeFallback>;
/**
* supported data types for constructing a tensor from a WebGPU buffer
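For context: `Tensor.fromGpuBuffer` accepts a WebGPU buffer; with this change the buffer type resolves to the real `GPUBuffer` when "@webgpu/types" is installed, and to the small structural fallback above otherwise. A hedged sketch (the dims and dataType are arbitrary examples, and the `GPUBuffer` parameter type assumes "@webgpu/types" is available):

```ts
import * as ort from 'onnxruntime-web';

function wrapGpuBuffer(buffer: GPUBuffer): ort.Tensor {
  // The buffer is referenced in place; the caller stays responsible for its lifetime.
  return ort.Tensor.fromGpuBuffer(buffer, { dataType: 'float32', dims: [1, 3, 224, 224] });
}
```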
diff --git a/js/common/lib/training-session-impl.ts b/js/common/lib/training-session-impl.ts
deleted file mode 100644
index 21dbe5fe51bb9..0000000000000
--- a/js/common/lib/training-session-impl.ts
+++ /dev/null
@@ -1,273 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-import { resolveBackendAndExecutionProviders } from './backend-impl.js';
-import { SessionHandler, TrainingSessionHandler } from './backend.js';
-import { InferenceSession as InferenceSession } from './inference-session.js';
-import { OnnxValue } from './onnx-value.js';
-import { Tensor } from './tensor.js';
-import { TrainingSession as TrainingSessionInterface, TrainingSessionCreateOptions } from './training-session.js';
-
-type SessionOptions = InferenceSession.SessionOptions;
-type FeedsType = InferenceSession.FeedsType;
-type FetchesType = InferenceSession.FetchesType;
-type ReturnType = InferenceSession.ReturnType;
-type RunOptions = InferenceSession.RunOptions;
-
-const noBackendErrMsg: string =
- 'Training backend could not be resolved. ' + "Make sure you're using the correct configuration & WebAssembly files.";
-
-export class TrainingSession implements TrainingSessionInterface {
- private constructor(handler: TrainingSessionHandler, hasOptimizerModel: boolean, hasEvalModel: boolean) {
- this.handler = handler;
- this.hasOptimizerModel = hasOptimizerModel;
- this.hasEvalModel = hasEvalModel;
- }
- private handler: TrainingSessionHandler;
- private hasOptimizerModel: boolean;
- private hasEvalModel: boolean;
-
- get trainingInputNames(): readonly string[] {
- return this.handler.inputNames;
- }
- get trainingOutputNames(): readonly string[] {
- return this.handler.outputNames;
- }
-
- get evalInputNames(): readonly string[] {
- if (this.hasEvalModel) {
- return this.handler.evalInputNames;
- } else {
- throw new Error('This training session has no evalModel loaded.');
- }
- }
- get evalOutputNames(): readonly string[] {
- if (this.hasEvalModel) {
- return this.handler.evalOutputNames;
- } else {
- throw new Error('This training session has no evalModel loaded.');
- }
- }
-
- static async create(
- trainingOptions: TrainingSessionCreateOptions,
- sessionOptions?: SessionOptions,
- ): Promise<TrainingSession> {
- const evalModel: string | Uint8Array = trainingOptions.evalModel || '';
- const optimizerModel: string | Uint8Array = trainingOptions.optimizerModel || '';
- const options: SessionOptions = sessionOptions || {};
-
- // resolve backend, update session options with validated EPs, and create session handler
- const [backend, optionsWithValidatedEPs] = await resolveBackendAndExecutionProviders(options);
- if (backend.createTrainingSessionHandler) {
- const handler = await backend.createTrainingSessionHandler(
- trainingOptions.checkpointState,
- trainingOptions.trainModel,
- evalModel,
- optimizerModel,
- optionsWithValidatedEPs,
- );
- return new TrainingSession(handler, !!trainingOptions.optimizerModel, !!trainingOptions.evalModel);
- } else {
- throw new Error(noBackendErrMsg);
- }
- }
-
- /**
- * Helper function for runTrainStep and future runStep methods that handles the type-narrowing conversion from
- * the given parameters to SessionHandler.FetchesType and RunOptions.
- *
- * @param inputNames the feeds object is checked that they contain all input names in the provided list of input
- * names.
- * @param outputNames the fetches object is checked that their keys match up with valid names in the list of output
- * names.
- * @param feeds the required input
- * @param arg1 narrowed & converted into the SessionHandler.FetchesType or RunOptions object
- * @param arg2 optional RunOptions object.
- * @returns
- */
- typeNarrowingForRunStep(
- inputNames: readonly string[],
- outputNames: readonly string[],
- feeds: FeedsType,
- arg1?: FetchesType | RunOptions,
- arg2?: RunOptions,
- ): [SessionHandler.FetchesType, RunOptions] {
- const fetches: { [name: string]: OnnxValue | null } = {};
- let options: RunOptions = {};
- // check inputs
- if (typeof feeds !== 'object' || feeds === null || feeds instanceof Tensor || Array.isArray(feeds)) {
- throw new TypeError(
- "'feeds' must be an object that use input names as keys and OnnxValue as corresponding values.",
- );
- }
-
- let isFetchesEmpty = true;
- // determine which override is being used
- if (typeof arg1 === 'object') {
- if (arg1 === null) {
- throw new TypeError('Unexpected argument[1]: cannot be null.');
- }
- if (arg1 instanceof Tensor) {
- throw new TypeError("'fetches' cannot be a Tensor");
- }
-
- if (Array.isArray(arg1)) {
- if (arg1.length === 0) {
- throw new TypeError("'fetches' cannot be an empty array.");
- }
- isFetchesEmpty = false;
- // output names
- for (const name of arg1) {
- if (typeof name !== 'string') {
- throw new TypeError("'fetches' must be a string array or an object.");
- }
- if (outputNames.indexOf(name) === -1) {
- throw new RangeError(`'fetches' contains invalid output name: ${name}.`);
- }
- fetches[name] = null;
- }
-
- if (typeof arg2 === 'object' && arg2 !== null) {
- options = arg2;
- } else if (typeof arg2 !== 'undefined') {
- throw new TypeError("'options' must be an object.");
- }
- } else {
- // decide whether arg1 is fetches or options
- // if any output name is present and its value is valid OnnxValue, we consider it fetches
- let isFetches = false;
- const arg1Keys = Object.getOwnPropertyNames(arg1);
- for (const name of outputNames) {
- if (arg1Keys.indexOf(name) !== -1) {
- const v = (arg1 as InferenceSession.NullableOnnxValueMapType)[name];
- if (v === null || v instanceof Tensor) {
- isFetches = true;
- isFetchesEmpty = false;
- fetches[name] = v;
- }
- }
- }
-
- if (isFetches) {
- if (typeof arg2 === 'object' && arg2 !== null) {
- options = arg2;
- } else if (typeof arg2 !== 'undefined') {
- throw new TypeError("'options' must be an object.");
- }
- } else {
- options = arg1 as RunOptions;
- }
- }
- } else if (typeof arg1 !== 'undefined') {
- throw new TypeError("Unexpected argument[1]: must be 'fetches' or 'options'.");
- }
-
- // check if all inputs are in feed
- for (const name of inputNames) {
- if (typeof feeds[name] === 'undefined') {
- throw new Error(`input '${name}' is missing in 'feeds'.`);
- }
- }
-
- // if no fetches is specified, we use the full output names list
- if (isFetchesEmpty) {
- for (const name of outputNames) {
- fetches[name] = null;
- }
- }
-
- return [fetches, options];
- }
-
- /**
- * Helper method for runTrainStep and any other runStep methods. Takes the ReturnType result from the SessionHandler
- * and changes it into a map of Tensors.
- *
- * @param results
- * @returns
- */
- convertHandlerReturnTypeToMapOfTensors(results: SessionHandler.ReturnType): ReturnType {
- const returnValue: { [name: string]: OnnxValue } = {};
- for (const key in results) {
- if (Object.hasOwnProperty.call(results, key)) {
- const result = results[key];
- if (result instanceof Tensor) {
- returnValue[key] = result;
- } else {
- returnValue[key] = new Tensor(result.type, result.data, result.dims);
- }
- }
- }
- return returnValue;
- }
-
- async lazyResetGrad(): Promise<void> {
- await this.handler.lazyResetGrad();
- }
-
- runTrainStep(feeds: FeedsType, options?: RunOptions): Promise<ReturnType>;
- runTrainStep(feeds: FeedsType, fetches: FetchesType, options?: RunOptions): Promise<ReturnType>;
- async runTrainStep(feeds: FeedsType, arg1?: FetchesType | RunOptions, arg2?: RunOptions): Promise<ReturnType> {
- const [fetches, options] = this.typeNarrowingForRunStep(
- this.trainingInputNames,
- this.trainingOutputNames,
- feeds,
- arg1,
- arg2,
- );
- const results = await this.handler.runTrainStep(feeds, fetches, options);
- return this.convertHandlerReturnTypeToMapOfTensors(results);
- }
-
- async runOptimizerStep(options?: InferenceSession.RunOptions | undefined): Promise<void> {
- if (this.hasOptimizerModel) {
- await this.handler.runOptimizerStep(options || {});
- } else {
- throw new Error('This TrainingSession has no OptimizerModel loaded.');
- }
- }
-
- runEvalStep(feeds: FeedsType, options?: RunOptions | undefined): Promise<ReturnType>;
- runEvalStep(feeds: FeedsType, fetches: FetchesType, options?: RunOptions | undefined): Promise<ReturnType>;
- async runEvalStep(feeds: FeedsType, arg1?: FetchesType | RunOptions, arg2?: RunOptions): Promise<ReturnType> {
- if (this.hasEvalModel) {
- const [fetches, options] = this.typeNarrowingForRunStep(
- this.evalInputNames,
- this.evalOutputNames,
- feeds,
- arg1,
- arg2,
- );
- const results = await this.handler.runEvalStep(feeds, fetches, options);
- return this.convertHandlerReturnTypeToMapOfTensors(results);
- } else {
- throw new Error('This TrainingSession has no EvalModel loaded.');
- }
- }
-
- async getParametersSize(trainableOnly = true): Promise<number> {
- return this.handler.getParametersSize(trainableOnly);
- }
-
- async loadParametersBuffer(array: Uint8Array, trainableOnly = true): Promise<void> {
- const paramsSize = await this.getParametersSize(trainableOnly);
- // checking that the size of the Uint8Array is equivalent to the byte length of a Float32Array of the number
- // of parameters
- if (array.length !== 4 * paramsSize) {
- throw new Error(
- 'Size of the buffer passed into loadParametersBuffer must match the number of parameters in ' +
- 'the model. Please use getParametersSize method to check.',
- );
- }
- return this.handler.loadParametersBuffer(array, trainableOnly);
- }
-
- async getContiguousParameters(trainableOnly = true): Promise<OnnxValue> {
- return this.handler.getContiguousParameters(trainableOnly);
- }
-
- async release(): Promise<void> {
- return this.handler.dispose();
- }
-}
diff --git a/js/common/lib/training-session.ts b/js/common/lib/training-session.ts
deleted file mode 100644
index 45dcafc46deb5..0000000000000
--- a/js/common/lib/training-session.ts
+++ /dev/null
@@ -1,206 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-import { InferenceSession } from './inference-session.js';
-import { OnnxValue } from './onnx-value.js';
-import { TrainingSession as TrainingSessionImpl } from './training-session-impl.js';
-
-/* eslint-disable @typescript-eslint/no-redeclare */
-
-export declare namespace TrainingSession {
- /**
- * Either URI file path (string) or Uint8Array containing model or checkpoint information.
- */
- type UriOrBuffer = string | Uint8Array;
-}
-
-/**
- * Represent a runtime instance of an ONNX training session,
- * which contains a model that can be trained, and, optionally,
- * an eval and optimizer model.
- */
-export interface TrainingSession {
- // #region run()
-
- /**
- * Lazily resets the gradients of all trainable parameters to zero. Should happen after the invocation of
- * runOptimizerStep.
- */
- lazyResetGrad(): Promise<void>;
-
- /**
- * Run TrainStep asynchronously with the given feeds and options.
- *
- * @param feeds - Representation of the model input. See type description of `InferenceSession.InputType` for
- detail.
- * @param options - Optional. A set of options that controls the behavior of model training.
- * @returns A promise that resolves to a map, which uses output names as keys and OnnxValue as corresponding values.
- */
- runTrainStep(
- feeds: InferenceSession.FeedsType,
- options?: InferenceSession.RunOptions,
- ): Promise<InferenceSession.ReturnType>;
-
- /**
- * Run a single train step with the given inputs and options.
- *
- * @param feeds - Representation of the model input.
- * @param fetches - Representation of the model output.
- * detail.
- * @param options - Optional. A set of options that controls the behavior of model training.
- * @returns A promise that resolves to a map, which uses output names as keys and OnnxValue as corresponding
- values.
- */
- runTrainStep(
- feeds: InferenceSession.FeedsType,
- fetches: InferenceSession.FetchesType,
- options?: InferenceSession.RunOptions,
- ): Promise<InferenceSession.ReturnType>;
-
- /**
- * Runs a single optimizer step, which performs weight updates for the trainable parameters using the optimizer model.
- *
- * @param options - Optional. A set of options that controls the behavior of model optimizing.
- */
- runOptimizerStep(options?: InferenceSession.RunOptions): Promise<void>;
-
- /**
- * Run a single eval step with the given inputs and options using the eval model.
- *
- * @param feeds - Representation of the model input.
- * @param options - Optional. A set of options that controls the behavior of model eval step.
- * @returns A promise that resolves to a map, which uses output names as keys and OnnxValue as corresponding
- values.
- */
- runEvalStep(
- feeds: InferenceSession.FeedsType,
- options?: InferenceSession.RunOptions,
- ): Promise<InferenceSession.ReturnType>;
-
- /**
- * Run a single eval step with the given inputs and options using the eval model.
- *
- * @param feeds - Representation of the model input.
- * @param fetches - Representation of the model output.
- * detail.
- * @param options - Optional. A set of options that controls the behavior of model eval step.
- * @returns A promise that resolves to a map, which uses output names as keys and OnnxValue as corresponding
- values.
- */
- runEvalStep(
- feeds: InferenceSession.FeedsType,
- fetches: InferenceSession.FetchesType,
- options?: InferenceSession.RunOptions,
- ): Promise<InferenceSession.ReturnType>;
-
- // #endregion
-
- // #region copy parameters
-
- /**
- * Retrieves the size of all parameters for the training state. Calculates the total number of primitive (datatype of
- * the parameters) elements of all the parameters in the training state.
- *
- * @param trainableOnly - When set to true, the size is calculated for trainable params only. Default value is true.
- */
- getParametersSize(trainableOnly: boolean): Promise<number>;
-
- /**
- * Copies parameter values from the given buffer to the training state. Currently, only supporting models with
- * parameters of type Float32.
- *
- * @param buffer - A Uint8Array representation of Float32 parameters.
- * @param trainableOnly - True if trainable parameters only to be modified, false otherwise. Default value is true.
- */
- loadParametersBuffer(buffer: Uint8Array, trainableOnly: boolean): Promise<void>;
-
- /**
- * Copies the model parameters to a contiguous buffer. Usually used in the context of Federated Learning.
- * Currently, only supporting models with parameters of type Float32.
- *
- * @param trainableOnly - When set to true, only trainable parameters are copied. Trainable parameters are parameters
- * for which requires_grad is set to true. Default value is true.
- * @returns A promise that resolves to a Float32 OnnxValue of the requested parameters.
- */
- getContiguousParameters(trainableOnly: boolean): Promise<OnnxValue>;
- // #endregion
-
- // #region release()
-
- /**
- * Release the inference session and the underlying resources.
- */
- release(): Promise<void>;
- // #endregion
-
- // #region metadata
-
- /**
- * Get input names of the loaded training model.
- */
- readonly trainingInputNames: readonly string[];
-
- /**
- * Get output names of the loaded training model.
- */
- readonly trainingOutputNames: readonly string[];
-
- /**
- * Get input names of the loaded eval model. Is an empty array if no eval model is loaded.
- */
- readonly evalInputNames: readonly string[];
-
- /**
- * Get output names of the loaded eval model. Is an empty array if no eval model is loaded.
- */
- readonly evalOutputNames: readonly string[];
-
- // #endregion
-}
-
-/**
- * Represents the optional parameters that can be passed into the TrainingSessionFactory.
- */
-export interface TrainingSessionCreateOptions {
- /**
- * URI or buffer for a .ckpt file that contains the checkpoint for the training model.
- */
- checkpointState: TrainingSession.UriOrBuffer;
- /**
- * URI or buffer for the .onnx training file.
- */
- trainModel: TrainingSession.UriOrBuffer;
- /**
- * Optional. URI or buffer for the .onnx optimizer model file.
- */
- optimizerModel?: TrainingSession.UriOrBuffer;
- /**
- * Optional. URI or buffer for the .onnx eval model file.
- */
- evalModel?: TrainingSession.UriOrBuffer;
-}
-
-/**
- * Defines method overload possibilities for creating a TrainingSession.
- */
-export interface TrainingSessionFactory {
- // #region create()
-
- /**
- * Creates a new TrainingSession and asynchronously loads any models passed in through trainingOptions
- *
- * @param trainingOptions specify models and checkpoints to load into the Training Session
- * @param sessionOptions specify configuration for training session behavior
- *
- * @returns Promise that resolves to a TrainingSession object
- */
- create(
- trainingOptions: TrainingSessionCreateOptions,
- sessionOptions?: InferenceSession.SessionOptions,
- ): Promise<TrainingSession>;
-
- // #endregion
-}
-
-// eslint-disable-next-line @typescript-eslint/naming-convention
-export const TrainingSession: TrainingSessionFactory = TrainingSessionImpl;
diff --git a/js/common/lib/type-helper.ts b/js/common/lib/type-helper.ts
new file mode 100644
index 0000000000000..845ba3018d443
--- /dev/null
+++ b/js/common/lib/type-helper.ts
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+/**
+ * A helper type to get certain types if they are declared in global scope.
+ *
+ * For example, if you installed "@webgpu/types" as a dev dependency, then `TryGetGlobalType<'GPUDevice'>` will
+ * be type `GPUDevice`; otherwise it will be type `unknown`.
+ *
+ *
+ * We don't want to introduce "@webgpu/types" as a dependency of this package because:
+ *
+ * (1) For JavaScript users, it's not needed. TypeScript users can install it themselves as a dev dependency.
+ *
+ * (2) "@webgpu/types" requires "@types/dom-webcodecs" as a peer dependency when using TypeScript < v5.1, and its
+ * version needs to be chosen carefully according to the TypeScript version in use. So far there is no way to keep
+ * every TypeScript version happy, and we could easily break users on some TypeScript versions.
+ *
+ * for more info see https://github.com/gpuweb/types/issues/127
+ *
+ * Update (2024-08-07): Reason (2) may no longer be valid. Most people should be using TypeScript >= 5.1 by now.
+ * However, we are still not sure whether introducing "@webgpu/types" as a direct dependency is a good idea. We find
+ * this type helper useful for TypeScript users.
+ *
+ * @ignore
+ */
+export type TryGetGlobalType<Name extends string, Fallback = unknown> = typeof globalThis extends {
+ [k in Name]: { prototype: infer T };
+}
+ ? T
+ : Fallback;
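A short illustration of how this conditional type resolves (not part of the patch; the aliases below are hypothetical and assume the helper is in scope). When the named constructor exists on `globalThis`, the helper infers its `prototype` type; otherwise it yields the fallback:

```ts
// With "@webgpu/types" installed, globalThis has `GPUDevice: { prototype: GPUDevice }`,
// so the lookup resolves to GPUDevice; without it, the default fallback `unknown` is used.
type DeviceType = TryGetGlobalType<'GPUDevice'>;

// WebNN's MLTensor is not in standard lib typings yet, so this currently resolves
// to the provided fallback shape.
type TensorHandle = TryGetGlobalType<'MLTensor', { destroy(): void }>;
```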
diff --git a/js/common/typedoc.json b/js/common/typedoc.json
index 088c7ba4053e6..f9c7e7b19db41 100644
--- a/js/common/typedoc.json
+++ b/js/common/typedoc.json
@@ -1,6 +1,7 @@
{
"entryPoints": ["lib/index.ts"],
"excludeInternal": true,
+ "intentionallyNotExported": ["TryGetGlobalType"],
"name": "ONNX Runtime JavaScript API",
"readme": "none",
"cleanOutputDir": true
diff --git a/js/node/package-lock.json b/js/node/package-lock.json
index 239c0b1ba557b..6d3c96e579a47 100644
--- a/js/node/package-lock.json
+++ b/js/node/package-lock.json
@@ -276,12 +276,12 @@
"dev": true
},
"node_modules/axios": {
- "version": "1.6.1",
- "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.1.tgz",
- "integrity": "sha512-vfBmhDpKafglh0EldBEbVuoe7DyAavGSLWhuSm5ZSEKQnHhBf0xAAwybbNH1IkrJNGnS/VG4I5yxig1pCEXE4g==",
+ "version": "1.7.9",
+ "resolved": "https://registry.npmjs.org/axios/-/axios-1.7.9.tgz",
+ "integrity": "sha512-LhLcE7Hbiryz8oMDdDptSrWowmB4Bl6RCt6sIJKpRB4XtVf0iEgewX3au/pJqm+Py1kCASkb/FFKjxQaLtxJvw==",
"dev": true,
"dependencies": {
- "follow-redirects": "^1.15.0",
+ "follow-redirects": "^1.15.6",
"form-data": "^4.0.0",
"proxy-from-env": "^1.1.0"
}
@@ -455,9 +455,9 @@
"dev": true
},
"node_modules/cross-spawn": {
- "version": "7.0.3",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
- "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+ "version": "7.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+ "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
"dependencies": {
"path-key": "^3.1.0",
"shebang-command": "^2.0.0",
@@ -1581,12 +1581,12 @@
"dev": true
},
"axios": {
- "version": "1.6.1",
- "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.1.tgz",
- "integrity": "sha512-vfBmhDpKafglh0EldBEbVuoe7DyAavGSLWhuSm5ZSEKQnHhBf0xAAwybbNH1IkrJNGnS/VG4I5yxig1pCEXE4g==",
+ "version": "1.7.9",
+ "resolved": "https://registry.npmjs.org/axios/-/axios-1.7.9.tgz",
+ "integrity": "sha512-LhLcE7Hbiryz8oMDdDptSrWowmB4Bl6RCt6sIJKpRB4XtVf0iEgewX3au/pJqm+Py1kCASkb/FFKjxQaLtxJvw==",
"dev": true,
"requires": {
- "follow-redirects": "^1.15.0",
+ "follow-redirects": "^1.15.6",
"form-data": "^4.0.0",
"proxy-from-env": "^1.1.0"
}
@@ -1725,9 +1725,9 @@
"dev": true
},
"cross-spawn": {
- "version": "7.0.3",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
- "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+ "version": "7.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+ "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
"requires": {
"path-key": "^3.1.0",
"shebang-command": "^2.0.0",
diff --git a/js/node/script/install.js b/js/node/script/install.js
index b15bc03840599..fef93f9169a2c 100644
--- a/js/node/script/install.js
+++ b/js/node/script/install.js
@@ -21,6 +21,7 @@ const os = require('os');
const fs = require('fs');
const path = require('path');
const tar = require('tar');
+const { execFileSync } = require('child_process');
const { Readable } = require('stream');
// commandline flag:
@@ -58,10 +59,23 @@ if (NO_INSTALL || !shouldInstall) {
// Step.2: Download the required binaries
const artifactUrl = {
- 11: `https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-gpu-${
- ORT_VERSION
- }.tgz`,
- 12: `https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-gpu-cuda12-${
+ get 11() {
+ // TODO: support ORT Cuda v11 binaries
+ throw new Error(`CUDA 11 binaries are not supported by this script yet.
+
+To use ONNX Runtime Node.js binding with CUDA v11 support, please follow the manual steps:
+
+1. Use "--onnxruntime-node-install-cuda=skip" to skip the auto installation.
+2. Navigate to https://aiinfra.visualstudio.com/PublicPackages/_artifacts/feed/onnxruntime-cuda-11
+3. Download the binaries for your platform and architecture
+4. Extract the following binaries to "node_modules/onnxruntime-node/bin/napi-v3/linux/x64":
+ - libonnxruntime_providers_tensorrt.so
+ - libonnxruntime_providers_shared.so
+ - libonnxruntime.so.${ORT_VERSION}
+ - libonnxruntime_providers_cuda.so
+`);
+ },
+ 12: `https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-gpu-${
ORT_VERSION
}.tgz`,
}[INSTALL_CUDA_FLAG || tryGetCudaVersion()];
@@ -108,9 +122,27 @@ Use "--onnxruntime-node-install-cuda=skip" to skip the installation. You will st
function tryGetCudaVersion() {
// Should only return 11 or 12.
- // TODO: try to get the CUDA version from the system ( `nvcc --version` )
+ // try to get the CUDA version from the system ( `nvcc --version` )
+ let ver = 12;
+ try {
+ const nvccVersion = execFileSync('nvcc', ['--version'], { encoding: 'utf8' });
+ const match = nvccVersion.match(/release (\d+)/);
+ if (match) {
+ ver = parseInt(match[1]);
+ if (ver !== 11 && ver !== 12) {
+ throw new Error(`Unsupported CUDA version: ${ver}`);
+ }
+ }
+ } catch (e) {
+ if (e?.code === 'ENOENT') {
+ console.warn('`nvcc` not found. Assuming CUDA 12.');
+ } else {
+ console.warn('Failed to detect CUDA version from `nvcc --version`:', e.message);
+ }
+ }
- return 11;
+ // assume CUDA 12 if detection failed
+ return ver;
}
function parseInstallCudaFlag() {
diff --git a/js/node/tsconfig.json b/js/node/tsconfig.json
index c154c3e148ed0..0401fb9609ad6 100644
--- a/js/node/tsconfig.json
+++ b/js/node/tsconfig.json
@@ -1,7 +1,8 @@
{
"extends": "../tsconfig.json",
"compilerOptions": {
- "outDir": "dist"
+ "outDir": "dist",
+ "declaration": true
},
"include": ["lib"]
}
diff --git a/js/package-lock.json b/js/package-lock.json
index 594d0584ad80e..f4401c6e98c75 100644
--- a/js/package-lock.json
+++ b/js/package-lock.json
@@ -1573,9 +1573,9 @@
"dev": true
},
"node_modules/cross-spawn": {
- "version": "7.0.3",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
- "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+ "version": "7.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+ "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
"dev": true,
"dependencies": {
"path-key": "^3.1.0",
@@ -5922,9 +5922,9 @@
"dev": true
},
"cross-spawn": {
- "version": "7.0.3",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
- "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+ "version": "7.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+ "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
"dev": true,
"requires": {
"path-key": "^3.1.0",
diff --git a/js/react_native/android/build.gradle b/js/react_native/android/build.gradle
index 825990eba0fb8..521866ff0f3e2 100644
--- a/js/react_native/android/build.gradle
+++ b/js/react_native/android/build.gradle
@@ -7,7 +7,7 @@ buildscript {
}
dependencies {
- classpath 'com.android.tools.build:gradle:4.1.2'
+ classpath 'com.android.tools.build:gradle:7.4.2'
// noinspection DifferentKotlinGradleVersion
}
}
@@ -221,9 +221,8 @@ dependencies {
api "com.facebook.react:react-native:" + REACT_NATIVE_VERSION
api "org.mockito:mockito-core:2.28.2"
- androidTestImplementation "androidx.test:runner:1.1.0"
- androidTestImplementation "androidx.test:rules:1.1.0"
-
+ androidTestImplementation "androidx.test:runner:1.5.2"
+ androidTestImplementation "androidx.test:rules:1.5.0"
implementation "junit:junit:4.12"
androidTestImplementation "com.linkedin.dexmaker:dexmaker-mockito-inline-extended:2.28.1"
diff --git a/js/react_native/android/gradle.properties b/js/react_native/android/gradle.properties
index 465b04d1f5813..8fe6e40d76911 100644
--- a/js/react_native/android/gradle.properties
+++ b/js/react_native/android/gradle.properties
@@ -4,7 +4,7 @@
# Specifies the JVM arguments used for the daemon process.
# The setting is particularly useful for tweaking memory settings.
# Default value: -Xmx1024m -XX:MaxPermSize=256m
-# org.gradle.jvmargs=-Xmx2048m -XX:MaxPermSize=512m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8
+org.gradle.jvmargs=-Xmx4096m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8
#
# When configured, Gradle will run in incubating parallel mode.
# This option should only be used with decoupled projects. More details, visit
diff --git a/js/react_native/android/gradle/wrapper/gradle-wrapper.jar b/js/react_native/android/gradle/wrapper/gradle-wrapper.jar
index 62d4c053550b9..249e5832f090a 100644
Binary files a/js/react_native/android/gradle/wrapper/gradle-wrapper.jar and b/js/react_native/android/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/js/react_native/android/gradle/wrapper/gradle-wrapper.properties b/js/react_native/android/gradle/wrapper/gradle-wrapper.properties
index 51d930a381f3a..012d6d90445b4 100644
--- a/js/react_native/android/gradle/wrapper/gradle-wrapper.properties
+++ b/js/react_native/android/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionSha256Sum=7faa7198769f872826c8ef4f1450f839ec27f0b4d5d1e51bade63667cbccd205
-distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.3-bin.zip
+distributionSha256Sum=cb87f222c5585bd46838ad4db78463a5c5f3d336e5e2b98dc7c0c586527351c2
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.5-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
diff --git a/js/react_native/android/gradlew b/js/react_native/android/gradlew
index fbd7c515832da..a69d9cb6c2065 100755
--- a/js/react_native/android/gradlew
+++ b/js/react_native/android/gradlew
@@ -1,7 +1,7 @@
-#!/usr/bin/env sh
+#!/bin/sh
#
-# Copyright 2015 the original author or authors.
+# Copyright © 2015-2021 the original authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,67 +17,101 @@
#
##############################################################################
-##
-## Gradle start up script for UN*X
-##
+#
+# Gradle start up script for POSIX generated by Gradle.
+#
+# Important for running:
+#
+# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
+# noncompliant, but you have some other compliant shell such as ksh or
+# bash, then to run this script, type that shell name before the whole
+# command line, like:
+#
+# ksh Gradle
+#
+# Busybox and similar reduced shells will NOT work, because this script
+# requires all of these POSIX shell features:
+# * functions;
+# * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
+# «${var#prefix}», «${var%suffix}», and «$( cmd )»;
+# * compound commands having a testable exit status, especially «case»;
+# * various built-in commands including «command», «set», and «ulimit».
+#
+# Important for patching:
+#
+# (2) This script targets any POSIX shell, so it avoids extensions provided
+# by Bash, Ksh, etc; in particular arrays are avoided.
+#
+# The "traditional" practice of packing multiple parameters into a
+# space-separated string is a well documented source of bugs and security
+# problems, so this is (mostly) avoided, by progressively accumulating
+# options in "$@", and eventually passing that to Java.
+#
+# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
+# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
+# see the in-line comments for details.
+#
+# There are tweaks for specific operating systems such as AIX, CygWin,
+# Darwin, MinGW, and NonStop.
+#
+# (3) This script is generated from the Groovy template
+# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
+# within the Gradle project.
+#
+# You can find Gradle at https://github.com/gradle/gradle/.
+#
##############################################################################
# Attempt to set APP_HOME
+
# Resolve links: $0 may be a link
-PRG="$0"
-# Need this for relative symlinks.
-while [ -h "$PRG" ] ; do
- ls=`ls -ld "$PRG"`
- link=`expr "$ls" : '.*-> \(.*\)$'`
- if expr "$link" : '/.*' > /dev/null; then
- PRG="$link"
- else
- PRG=`dirname "$PRG"`"/$link"
- fi
+app_path=$0
+
+# Need this for daisy-chained symlinks.
+while
+ APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
+ [ -h "$app_path" ]
+do
+ ls=$( ls -ld "$app_path" )
+ link=${ls#*' -> '}
+ case $link in #(
+ /*) app_path=$link ;; #(
+ *) app_path=$APP_HOME$link ;;
+ esac
done
-SAVED="`pwd`"
-cd "`dirname \"$PRG\"`/" >/dev/null
-APP_HOME="`pwd -P`"
-cd "$SAVED" >/dev/null
+
+APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
APP_NAME="Gradle"
-APP_BASE_NAME=`basename "$0"`
+APP_BASE_NAME=${0##*/}
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
-MAX_FD="maximum"
+MAX_FD=maximum
warn () {
echo "$*"
-}
+} >&2
die () {
echo
echo "$*"
echo
exit 1
-}
+} >&2
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
-case "`uname`" in
- CYGWIN* )
- cygwin=true
- ;;
- Darwin* )
- darwin=true
- ;;
- MINGW* )
- msys=true
- ;;
- NONSTOP* )
- nonstop=true
- ;;
+case "$( uname )" in #(
+ CYGWIN* ) cygwin=true ;; #(
+ Darwin* ) darwin=true ;; #(
+ MSYS* | MINGW* ) msys=true ;; #(
+ NONSTOP* ) nonstop=true ;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
@@ -87,9 +121,9 @@ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
- JAVACMD="$JAVA_HOME/jre/sh/java"
+ JAVACMD=$JAVA_HOME/jre/sh/java
else
- JAVACMD="$JAVA_HOME/bin/java"
+ JAVACMD=$JAVA_HOME/bin/java
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
@@ -98,7 +132,7 @@ Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
- JAVACMD="java"
+ JAVACMD=java
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
@@ -106,80 +140,101 @@ location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
-if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
- MAX_FD_LIMIT=`ulimit -H -n`
- if [ $? -eq 0 ] ; then
- if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
- MAX_FD="$MAX_FD_LIMIT"
- fi
- ulimit -n $MAX_FD
- if [ $? -ne 0 ] ; then
- warn "Could not set maximum file descriptor limit: $MAX_FD"
- fi
- else
- warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
- fi
+if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
+ case $MAX_FD in #(
+ max*)
+ MAX_FD=$( ulimit -H -n ) ||
+ warn "Could not query maximum file descriptor limit"
+ esac
+ case $MAX_FD in #(
+ '' | soft) :;; #(
+ *)
+ ulimit -n "$MAX_FD" ||
+ warn "Could not set maximum file descriptor limit to $MAX_FD"
+ esac
fi
-# For Darwin, add options to specify how the application appears in the dock
-if $darwin; then
- GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
-fi
+# Collect all arguments for the java command, stacking in reverse order:
+# * args from the command line
+# * the main class name
+# * -classpath
+# * -D...appname settings
+# * --module-path (only if needed)
+# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
# For Cygwin or MSYS, switch paths to Windows format before running java
-if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
- APP_HOME=`cygpath --path --mixed "$APP_HOME"`
- CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
-
- JAVACMD=`cygpath --unix "$JAVACMD"`
-
- # We build the pattern for arguments to be converted via cygpath
- ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
- SEP=""
- for dir in $ROOTDIRSRAW ; do
- ROOTDIRS="$ROOTDIRS$SEP$dir"
- SEP="|"
- done
- OURCYGPATTERN="(^($ROOTDIRS))"
- # Add a user-defined pattern to the cygpath arguments
- if [ "$GRADLE_CYGPATTERN" != "" ] ; then
- OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
- fi
+if "$cygwin" || "$msys" ; then
+ APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
+ CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
+
+ JAVACMD=$( cygpath --unix "$JAVACMD" )
+
# Now convert the arguments - kludge to limit ourselves to /bin/sh
- i=0
- for arg in "$@" ; do
- CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
- CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
-
- if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
- eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
- else
- eval `echo args$i`="\"$arg\""
+ for arg do
+ if
+ case $arg in #(
+ -*) false ;; # don't mess with options #(
+ /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
+ [ -e "$t" ] ;; #(
+ *) false ;;
+ esac
+ then
+ arg=$( cygpath --path --ignore --mixed "$arg" )
fi
- i=`expr $i + 1`
+ # Roll the args list around exactly as many times as the number of
+ # args, so each arg winds up back in the position where it started, but
+ # possibly modified.
+ #
+ # NB: a `for` loop captures its iteration list before it begins, so
+ # changing the positional parameters here affects neither the number of
+ # iterations, nor the values presented in `arg`.
+ shift # remove old arg
+ set -- "$@" "$arg" # push replacement arg
done
- case $i in
- 0) set -- ;;
- 1) set -- "$args0" ;;
- 2) set -- "$args0" "$args1" ;;
- 3) set -- "$args0" "$args1" "$args2" ;;
- 4) set -- "$args0" "$args1" "$args2" "$args3" ;;
- 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
- 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
- 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
- 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
- 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
- esac
fi
-# Escape application args
-save () {
- for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
- echo " "
-}
-APP_ARGS=`save "$@"`
+# Collect all arguments for the java command;
+# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
+# shell script including quotes and variable substitutions, so put them in
+# double quotes to make sure that they get re-expanded; and
+# * put everything else in single quotes, so that it's not re-expanded.
+
+set -- \
+ "-Dorg.gradle.appname=$APP_BASE_NAME" \
+ -classpath "$CLASSPATH" \
+ org.gradle.wrapper.GradleWrapperMain \
+ "$@"
+
+# Stop when "xargs" is not available.
+if ! command -v xargs >/dev/null 2>&1
+then
+ die "xargs is not available"
+fi
+
+# Use "xargs" to parse quoted args.
+#
+# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
+#
+# In Bash we could simply go:
+#
+# readarray ARGS < <( xargs -n1 <<<"$var" ) &&
+# set -- "${ARGS[@]}" "$@"
+#
+# but POSIX shell has neither arrays nor command substitution, so instead we
+# post-process each arg (as a line of input to sed) to backslash-escape any
+# character that might be a shell metacharacter, then use eval to reverse
+# that process (while maintaining the separation between arguments), and wrap
+# the whole thing up as a single "set" statement.
+#
+# This will of course break if any of these variables contains a newline or
+# an unmatched quote.
+#
-# Collect all arguments for the java command, following the shell quoting and substitution rules
-eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+eval "set -- $(
+ printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
+ xargs -n1 |
+ sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
+ tr '\n' ' '
+ )" '"$@"'
exec "$JAVACMD" "$@"
diff --git a/js/react_native/android/gradlew.bat b/js/react_native/android/gradlew.bat
index 5093609d512a9..f127cfd49d402 100644
--- a/js/react_native/android/gradlew.bat
+++ b/js/react_native/android/gradlew.bat
@@ -14,7 +14,7 @@
@rem limitations under the License.
@rem
-@if "%DEBUG%" == "" @echo off
+@if "%DEBUG%"=="" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@@ -25,7 +25,7 @@
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
-if "%DIRNAME%" == "" set DIRNAME=.
+if "%DIRNAME%"=="" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@@ -40,7 +40,7 @@ if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
-if "%ERRORLEVEL%" == "0" goto init
+if %ERRORLEVEL% equ 0 goto execute
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
@@ -54,7 +54,7 @@ goto fail
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
-if exist "%JAVA_EXE%" goto init
+if exist "%JAVA_EXE%" goto execute
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
@@ -64,21 +64,6 @@ echo location of your Java installation.
goto fail
-:init
-@rem Get command-line arguments, handling Windows variants
-
-if not "%OS%" == "Windows_NT" goto win9xME_args
-
-:win9xME_args
-@rem Slurp the command line arguments.
-set CMD_LINE_ARGS=
-set _SKIP=2
-
-:win9xME_args_slurp
-if "x%~1" == "x" goto execute
-
-set CMD_LINE_ARGS=%*
-
:execute
@rem Setup the command line
@@ -86,17 +71,19 @@ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
-"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
:end
@rem End local scope for the variables with windows NT shell
-if "%ERRORLEVEL%"=="0" goto mainEnd
+if %ERRORLEVEL% equ 0 goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
-if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
-exit /b 1
+set EXIT_CODE=%ERRORLEVEL%
+if %EXIT_CODE% equ 0 set EXIT_CODE=1
+if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
+exit /b %EXIT_CODE%
:mainEnd
if "%OS%"=="Windows_NT" endlocal
diff --git a/js/react_native/e2e/android/app/build.gradle b/js/react_native/e2e/android/app/build.gradle
index 8a84b0d5065a8..526259e3f8d8f 100644
--- a/js/react_native/e2e/android/app/build.gradle
+++ b/js/react_native/e2e/android/app/build.gradle
@@ -193,7 +193,7 @@ dependencies {
implementation "com.facebook.react:react-native:+" // From node_modules
implementation "androidx.swiperefreshlayout:swiperefreshlayout:1.0.0"
- implementation 'androidx.test.ext:junit:1.1.3'
+ implementation 'androidx.test.ext:junit:1.1.5'
debugImplementation("com.facebook.flipper:flipper:${FLIPPER_VERSION}") {
exclude group:'com.facebook.fbjni'
}
@@ -213,9 +213,9 @@ dependencies {
implementation jscFlavor
}
- androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0'
- androidTestImplementation 'androidx.test:runner:1.4.0'
- androidTestImplementation 'androidx.test:rules:1.4.0'
+ androidTestImplementation "androidx.test.espresso:espresso-core:3.5.0"
+ androidTestImplementation "androidx.test:runner:1.5.2"
+ androidTestImplementation "androidx.test:rules:1.5.0"
implementation project(':onnxruntime-react-native')
// specify ORT dependency here so it can be found in libs flatDir repository
diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md
index 5f329b5ff8b39..5c8748d75c2bc 100644
--- a/js/web/docs/webgpu-operators.md
+++ b/js/web/docs/webgpu-operators.md
@@ -50,12 +50,14 @@ Do not modify directly.*
| Gather | ai.onnx(1-10,11-12,13+) | |
| GatherBlockQuantized | com.microsoft(1+) | |
| GatherElements | ai.onnx(11-12,13+) | |
+| GatherND | ai.onnx(11,12,13+) | |
| Gelu | ai.onnx(20+); com.microsoft(1+) | |
| Gemm | ai.onnx(7-8,9-10,11-12,13+) | |
| GlobalAveragePool | ai.onnx(1+); com.ms.internal.nhwc(1+) | |
| GlobalMaxPool | ai.onnx(1+); com.ms.internal.nhwc(1+) | |
| Greater | ai.onnx(7-8,9-12,13+) | |
| GreaterOrEqual | ai.onnx(12-15,16+) | |
+| GridSample | ai.onnx(16-19); com.ms.internal.nhwc(16-19) | |
| GroupQueryAttention | com.microsoft(1+) | |
| HardSigmoid | ai.onnx(6+) | |
| If | ai.onnx(1-10,11-12,13-18,19-20,21+) | |
@@ -93,6 +95,7 @@ Do not modify directly.*
| Reshape | ai.onnx(5-12,13,14-18,19-20,21+) | no GPU kernel |
| Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(10,11-12,13-17,18,19+) | CoordinateTransformMode align_corners is not supported with downsampling |
| RotaryEmbedding | com.microsoft(1+) | |
+| ScatterND | ai.onnx(11-12,13-15,16-17,18+) | |
| Shape | ai.onnx(1-12,13-14,15-18,19-20,21+) | no GPU kernel; an ORT warning is generated - need to fix |
| Sigmoid | ai.onnx(6-12,13+) | |
| SimplifiedLayerNormalization | ai.onnx(1+) | |
diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md
index b8c3b2ec8ec57..e0012e70a7dec 100644
--- a/js/web/docs/webnn-operators.md
+++ b/js/web/docs/webnn-operators.md
@@ -25,10 +25,11 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
| Conv | ai.onnx(7-10, 11+) | conv2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight) |
| ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight). WebNN CPU backend only supports default dilations and group |
| Cos | ai.onnx(7+) | cos | ✓ | ✓ | |
-| CumSum | ai.onnx(11-13, 14+) | cumulativeSum | ✓ | ✓ | |
+| CumSum | ai.onnx(11-13, 14+) | cumulativeSum | ✓ | ✓ | 'axis' input should be a constant |
| Div | ai.onnx(7-12, 13, 14+) | div | ✓ | ✓ | |
| DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | ✗ | ✓ | |
| Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity | ✓ | ✓ | Only supports test mode |
+| Einsum | ai.onnx(12+) | reshape, transpose, matmul, reduceSum, mul, triangular | ✓ | ✓ | |
| Elu | ai.onnx(7+) | elu | ✓ | ✓ | WebNN CPU backend only supports 'alpha' value is 1.0 |
| Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | ✓ | ✓ | |
| Erf | ai.onnx(7-9, 10-12, 13+) | erf | ✓ | ✓ | |
@@ -57,6 +58,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
| LessOrEqual | ai.onnx(12-15, 16+) | lesserOrEqual | ✓ | ✓ | |
| Log | ai.onnx(7-12, 13+) | log | ✓ | ✓ | |
| LpPool | ai.onnx(7-10, 11-17, 18+) | l2Pool2d | ✗ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'p' value is 2 |
+| LRN | ai.onnx(7-12, 13+) | pad, averagePool2d, transpose, add, mul, pow, div | ✓ | ✓ | |
| LSTM | ai.onnx(7-13, 14-21, 22+) | lstm | ✓ | ✓ | Only supports 'layout' == 0, 'input_forget' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' |
| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | ✓ | ✓ | |
| Max | ai.onnx(7, 8-11, 12, 13+) | max | ✓ | ✓ | |
@@ -83,7 +85,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
| ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | ✓ | ✓ | Input 'axes' if present should be a constant |
| Relu | ai.onnx(7-12, 13, 14+) | relu | ✓ | ✓ | |
| Reshape | ai.onnx(7-12, 13, 14-18, 19-20, 21+) | reshape | ✓ | ✓ | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported |
-| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, antialias == 0, coordinate_transformation_mode == 'half_pixel', exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant |
+| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, antialias == 0, exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant |
| ScatterElements | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterElements | ✗ | ✓ | Only supports 'reduction' == 'none' |
| ScatterND | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterND | ✗ | ✓ | Only supports 'reduction' == 'none' |
| Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | ✓ | ✓ | |
@@ -93,7 +95,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
| Softplus | ai.onnx(7+) | softplus | ✓ | ✓ | |
| Softsign | ai.onnx(7+) | softsign | ✓ | ✓ | |
| Sin | ai.onnx(7+) | sin | ✓ | ✓ | |
-| Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice | ✓ | ✓ | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant, only supports 'steps' value 1 |
+| Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice, reverse | ✓ | ✓ | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant |
| Softmax | ai.onnx(7-10, 11-12, 13+) | softmax | ✓ | ✓ | |
| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | ✓ | ✓ | Input 'split' if present should be a constant |
| Sqrt | ai.onnx(7-12, 13+) | sqrt | ✓ | ✓ | |
diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
index 371e19e00d95e..7c5f99f1a4c67 100644
--- a/js/web/lib/wasm/jsep/backend-webgpu.ts
+++ b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -13,6 +13,7 @@ import { ProgramManager } from './webgpu/program-manager';
import {
AdapterInfo,
ComputeContext,
+ DeviceInfo,
GpuArchitecture,
GpuData,
GpuVendor,
@@ -134,6 +135,26 @@ class AdapterInfoImpl implements AdapterInfo {
}
}
+class DeviceInfoImpl implements DeviceInfo {
+ readonly subgroupsSupported: boolean;
+ readonly subgroupsF16Supported: boolean;
+ readonly subgroupSizeRange?: readonly [number, number];
+
+ constructor(device: GPUDevice) {
+ this.subgroupsSupported = device.features.has('subgroups' as GPUFeatureName);
+ this.subgroupsF16Supported = device.features.has('subgroups' as GPUFeatureName);
+ // Currently the subgroups feature is still experimental and its size attributes are not in the WebGPU IDL, so we
+ // have to work around the IDL type checks.
+ // TODO: clean this after subgroups feature is settled in IDL.
+ const deviceSubgroupsLimits = device.limits as { minSubgroupSize?: number; maxSubgroupSize?: number };
+ if (!this.subgroupsSupported || !deviceSubgroupsLimits.minSubgroupSize || !deviceSubgroupsLimits.maxSubgroupSize) {
+ this.subgroupSizeRange = undefined;
+ } else {
+ this.subgroupSizeRange = [deviceSubgroupsLimits.minSubgroupSize, deviceSubgroupsLimits.maxSubgroupSize];
+ }
+ }
+}
+
/**
* this class is designed to store status and being used as a singleton for JSEP. It will be passed to jsepInit() as
* the first parameter so that it is stored for future use.
@@ -141,6 +162,7 @@ class AdapterInfoImpl implements AdapterInfo {
export class WebGpuBackend {
adapterInfo: AdapterInfoImpl;
device: GPUDevice;
+ deviceInfo: DeviceInfoImpl;
/**
* an instance of GpuDataManager to manage a GpuDataId -> GpuBuffer mapping
*/
@@ -243,19 +265,25 @@ export class WebGpuBackend {
requiredFeatures,
};
- if (adapter.features.has('chromium-experimental-timestamp-query-inside-passes')) {
- requiredFeatures.push('chromium-experimental-timestamp-query-inside-passes' as GPUFeatureName);
- } else if (adapter.features.has('timestamp-query')) {
- requiredFeatures.push('timestamp-query');
+ // Try requiring WebGPU features
+ const requireFeatureIfAvailable = (feature: GPUFeatureName) =>
+ adapter.features.has(feature) && requiredFeatures.push(feature) && true;
+ // Try chromium-experimental-timestamp-query-inside-passes and fallback to timestamp-query
+ if (!requireFeatureIfAvailable('chromium-experimental-timestamp-query-inside-passes' as GPUFeatureName)) {
+ requireFeatureIfAvailable('timestamp-query');
}
- if (adapter.features.has('shader-f16')) {
- requiredFeatures.push('shader-f16');
+ requireFeatureIfAvailable('shader-f16');
+ // Try subgroups
+ if (requireFeatureIfAvailable('subgroups' as GPUFeatureName)) {
+ // If subgroups feature is available, also try subgroups-f16
+ requireFeatureIfAvailable('subgroups-f16' as GPUFeatureName);
}
if (adapter.features.has('chromium-experimental-subgroups')) {
requiredFeatures.push('chromium-experimental-subgroups' as GPUFeatureName);
}
this.device = await adapter.requestDevice(deviceDescriptor);
+ this.deviceInfo = new DeviceInfoImpl(this.device);
this.adapterInfo = new AdapterInfoImpl(adapter.info || (await adapter.requestAdapterInfo()));
this.gpuDataManager = createGpuDataManager(this);
this.programManager = new ProgramManager(this);
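For context: a hedged sketch of how a WebGPU kernel could consult the new `DeviceInfo` through `ComputeContext` to pick a shader variant. Illustrative only; `chooseMatMulVariant` and the size threshold are made up, and the import path assumes a module that sits next to `webgpu/types.ts`:

```ts
import type { ComputeContext } from './webgpu/types';

function chooseMatMulVariant(context: ComputeContext): 'subgroup' | 'baseline' {
  const info = context.deviceInfo;
  // Use a subgroup-based shader only when the feature was actually granted on the
  // device and the adapter reports a usable subgroup size range.
  if (info.subgroupsSupported && info.subgroupSizeRange) {
    const [minSubgroupSize] = info.subgroupSizeRange;
    return minSubgroupSize >= 8 ? 'subgroup' : 'baseline';
  }
  return 'baseline';
}
```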
diff --git a/js/web/lib/wasm/jsep/backend-webnn.ts b/js/web/lib/wasm/jsep/backend-webnn.ts
index d13c663651127..b302354c46eeb 100644
--- a/js/web/lib/wasm/jsep/backend-webnn.ts
+++ b/js/web/lib/wasm/jsep/backend-webnn.ts
@@ -226,7 +226,7 @@ export class WebNNBackend {
return id;
}
- // Register WebNN Constant operands from external data.
+ // Register a WebNN Constant operand from external data.
public registerMLConstant(
externalFilePath: string,
dataOffset: number,
diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts
index fddc061cd775a..48bd3ef2bc36f 100644
--- a/js/web/lib/wasm/jsep/init.ts
+++ b/js/web/lib/wasm/jsep/init.ts
@@ -11,7 +11,13 @@ import { WebGpuBackend } from './backend-webgpu';
import { LOG_DEBUG } from './log';
import { TensorView } from './tensor-view';
import { ShapeUtil } from './util';
-import { AdapterInfo, ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo } from './webgpu/types';
+import {
+ AdapterInfo,
+ ComputeContext,
+ ComputeContextInputsOutputsMapping,
+ DeviceInfo,
+ ProgramInfo,
+} from './webgpu/types';
import { WebNNBackend } from './backend-webnn';
/* eslint-disable no-bitwise */
@@ -70,6 +76,7 @@ class TensorViewImpl implements TensorView {
class ComputeContextImpl implements ComputeContext {
readonly adapterInfo: AdapterInfo;
+ readonly deviceInfo: DeviceInfo;
readonly opKernelContext: number;
readonly inputs: readonly TensorView[];
readonly outputCount: number;
@@ -87,6 +94,7 @@ class ComputeContextImpl implements ComputeContext {
contextDataOffset: number,
) {
this.adapterInfo = backend.adapterInfo;
+ this.deviceInfo = backend.deviceInfo;
// extract context data
const ptrSize = module.PTR_SIZE;
@@ -112,18 +120,6 @@ class ComputeContextImpl implements ComputeContext {
this.inputs = inputs;
}
- getMaxComputeWorkgroupSizes(): [number, number, number] {
- return [
- this.backend.device.limits.maxComputeWorkgroupSizeX,
- this.backend.device.limits.maxComputeWorkgroupSizeY,
- this.backend.device.limits.maxComputeWorkgroupSizeZ,
- ];
- }
-
- getMaxComputeWorkgroupStoragesize(): number {
- return this.backend.device.limits.maxComputeWorkgroupStorageSize;
- }
-
compute(program: ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[] {
// prepare inputs. inputs should always be valid data.
const mappedInputs =
diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
index 09c786daa3fcd..6c7afbc7365bb 100644
--- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
+++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -16,9 +16,11 @@ import { einsum, parseEinsumAttributes } from './ops/einsum';
import { expand } from './ops/expand';
import { fastGelu } from './ops/fast-gelu';
import { gather, parseGatherAttributes } from './ops/gather';
+import { gatherND, parseGatherNDAttributes } from './ops/gather-nd';
import { gatherBlockQuantized, parseGatherBlockQuantizedAttributes } from './ops/gather-block-quantized';
import { gatherElements, parseGatherElementsAttributes } from './ops/gather-elements';
import { gemm, parseGemmAttributes } from './ops/gemm';
+import { gridSample, parseGridSampleAttributes } from './ops/grid-sample';
import { groupQueryAttention } from './ops/group-query-attention';
import { instanceNorm } from './ops/instance-norm';
import { layerNorm } from './ops/layer-norm';
@@ -29,6 +31,7 @@ import { pad } from './ops/pad';
import * as pool from './ops/pool';
import { dequantizeLinear, parseDequantizeLinearAttributes } from './ops/quantize-linear';
import { range } from './ops/range';
+import { scatterND, parseScatterNDAttributes } from './ops/scatter-nd';
import {
reduceL1,
reduceL2,
@@ -98,12 +101,14 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
['Gather', [gather, parseGatherAttributes]],
['GatherElements', [gatherElements, parseGatherElementsAttributes]],
['GatherBlockQuantized', [gatherBlockQuantized, parseGatherBlockQuantizedAttributes]],
+ ['GatherND', [gatherND, parseGatherNDAttributes]],
['Gelu', [unaryOps.gelu]],
['Gemm', [gemm, parseGemmAttributes]],
['GlobalAveragePool', [pool.globalAveragePool, pool.parseGlobalAveragePoolAttributes]],
['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]],
['Greater', [binaryOps.greater]],
['GreaterOrEqual', [binaryOps.greaterOrEqual]],
+ ['GridSample', [gridSample, parseGridSampleAttributes]],
['GroupQueryAttention', [groupQueryAttention]],
['HardSigmoid', [unaryOps.hardSigmoid, unaryOps.parseHardSigmoidAttributes]],
['InstanceNormalization', [instanceNorm]],
@@ -138,6 +143,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
['Relu', [unaryOps.relu]],
['Resize', [resize, parseResizeAttributes]],
['RotaryEmbedding', [rotaryEmbedding]],
+ ['ScatterND', [scatterND, parseScatterNDAttributes]],
['Sigmoid', [unaryOps.sigmoid]],
['Sin', [unaryOps.sin]],
['Sinh', [unaryOps.sinh]],
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
index 2a8756e435b8e..cb1f30ecdd1f4 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
@@ -29,229 +29,27 @@ import {
ShaderHelper,
tensorTypeToWsglStorageType,
UniformsArrayType,
+ getMaxComponents,
} from '../common';
import { ConvTransposeAttributes } from '../conv-transpose';
-const createConvTranspose2DOpProgramShaderSource = (
- shaderHelper: ShaderHelper,
- inputs: readonly TensorView[],
- outputShape: readonly number[],
- hasBias: boolean,
- is1DimensionDispatch: boolean,
- isVec4 = false,
- dataType: string,
- uniforms: UniformsArrayType,
- isChannelsLast = false,
-): string => {
- const rowDim = isChannelsLast ? 1 : 2;
- const colDim = isChannelsLast ? 2 : 3;
- const channelDim = isChannelsLast ? 3 : 1;
- const workPerThread = isVec4 ? 2 : 1;
-
- let declareFunctions = `
- fn setOutputAtIndex(flatIndex : u32, value : ${isVec4 ? `vec4<${dataType}>` : dataType}) {
- result[flatIndex] = ${isVec4 ? `vec4<${dataType}>` : dataType}(value);
- }`;
- if (hasBias) {
- declareFunctions += `
- fn getBiasByOutputCoords(coords : vec4<u32>) -> ${isVec4 ? `vec4<${dataType}>` : dataType} {
- return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? '/ 4' : ''}];
- }`;
- }
- const components = isVec4 ? 4 : 1;
- const w = inputVariable('W', inputs[1].dataType, inputs[1].dims.length, components);
- const dy = inputVariable('Dy', inputs[0].dataType, inputs[0].dims.length, components);
- const inputVariables = [dy, w];
- if (hasBias) {
- inputVariables.push(inputVariable('bias', inputs[2].dataType, [outputShape[channelDim]].length, components));
- }
- const output = outputVariable('result', inputs[0].dataType, outputShape.length, components);
-
- const codeSnippet4 = `{
- let batch: u32 = ${is1DimensionDispatch ? 'global_id.z' : 'workgroup_id.z'} / uniforms.result_shape[1];
- let r = ${is1DimensionDispatch ? 'global_id.z' : 'workgroup_id.z'} % uniforms.result_shape[1];
- let c = ${is1DimensionDispatch ? 'global_id.y' : 'workgroup_id.y'} * ${workPerThread};
- let d1: u32 = ${is1DimensionDispatch ? 'global_id.x' : 'workgroup_id.x'} * 4;
-
- let dyCorner = vec2(i32(r), i32(c)) - vec2(uniforms.pads);
-
- // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1).
- // ? = to be determined. : = across all values in that axis.
- var dotProd: array<vec4<${dataType}>, ${workPerThread}>;
- for (var i = 0; i < ${workPerThread}; i++) {
- dotProd[i] = vec4<${dataType}>(0.0);
- }
- for (var wR: u32 = 0; wR < uniforms.filter_dims[0]; wR = wR + 1) {
- var dyR = (${dataType}(dyCorner.x) + ${dataType}(wR)) / ${dataType}(uniforms.strides.x);
- let wRPerm = uniforms.filter_dims[0] - 1 - wR;
- if (dyR < 0.0 || dyR >= ${dataType}(uniforms.Dy_shape[1]) ||
- fract(dyR) > 0.0 || wRPerm < 0) {
- continue;
- }
- let idyR: u32 = u32(dyR);
-
- for (var wC: u32 = 0; wC < uniforms.filter_dims[1]; wC = wC + 1) {
- let dyC = (${dataType}(dyCorner.y) + ${dataType}(wC)) / ${dataType}(uniforms.strides.y);
- let dyC2 = (${dataType}(dyCorner.y) + 1.0 + ${dataType}(wC)) / ${dataType}(uniforms.strides.y);
- let wCPerm = uniforms.filter_dims[1] - 1 - wC;
- if (wCPerm < 0) {
- continue;
- }
- var bDyCVal = true;
- var bDyCVal2 = true;
- if (dyC < 0.0 || dyC >= ${dataType}(uniforms.Dy_shape[2]) ||
- fract(dyC) > 0.0) {
- bDyCVal = false;
- }
- if (dyC2 < 0.0 || dyC2 >= ${dataType}(uniforms.Dy_shape[2]) ||
- fract(dyC2) > 0.0) {
- bDyCVal2 = false;
- }
-
- let idyC: u32 = u32(dyC);
- let idyC2: u32 = u32(dyC2);
- if (bDyCVal && bDyCVal2) {
- let d2Length = uniforms.Dy_shape[3];
- for (var d2 :u32 = 0; d2 < d2Length; d2 = d2 + 4) {
- let wValue0 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1', 'd2')};
- let wValue1 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 1', 'd2')};
- let wValue2 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 2', 'd2')};
- let wValue3 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 3', 'd2')};
-
- var xValue = ${dy.get('batch', 'idyR', 'idyC', 'd2')};
- let tmpval = vec4<${dataType}>(dot(xValue, wValue0),
- dot(xValue, wValue1),
- dot(xValue, wValue2),
- dot(xValue, wValue3));
- dotProd[0] = dotProd[0] + tmpval;
-
- xValue = ${dy.get('batch', 'idyR', 'idyC2', 'd2')};
-
- dotProd[1] = dotProd[1] + vec4<${dataType}>(dot(xValue, wValue0),
- dot(xValue, wValue1),
- dot(xValue, wValue2),
- dot(xValue, wValue3));
- }
- } else if (bDyCVal) {
- let d2Length = uniforms.Dy_shape[${channelDim}];
- for (var d2: u32 = 0; d2 < d2Length; d2 = d2 + 4) {
- let wValue0 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1', 'd2')};
- let wValue1 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 1', 'd2')};
- let wValue2 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 2', 'd2')};
- let wValue3 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 3', 'd2')};
-
- var xValue = ${dy.get('batch', 'idyR', 'idyC', 'd2')};
- let tmpval = vec4<${dataType}>(dot(xValue, wValue0),
- dot(xValue, wValue1),
- dot(xValue, wValue2),
- dot(xValue, wValue3));
- dotProd[0] = dotProd[0] + tmpval;
- }
- } else if (bDyCVal2) {
- let d2Length = uniforms.Dy_shape[3];
- for (var d2: u32 = 0; d2 < d2Length; d2 = d2 + 4) {
- let wValue0 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1', 'd2')};
- let wValue1 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 1', 'd2')};
- let wValue2 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 2', 'd2')};
- let wValue3 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 3', 'd2')};
-
- var xValue = ${dy.get('batch', 'idyR', 'idyC2', 'd2')};
- let tmpval = vec4<${dataType}>(dot(xValue, wValue0),
- dot(xValue, wValue1),
- dot(xValue, wValue2),
- dot(xValue, wValue3));
- dotProd[1] = dotProd[1] + tmpval;
- }
- }
- }
- }
-
- for (var i: u32 = 0; i < ${workPerThread}; i = i + 1) {
- let value = dotProd[i] + ${hasBias ? 'bias[c+i]' : `vec4<${dataType}>(0.0)`};
- ${output.set('batch', 'r', 'c + i', 'd1', 'value')};
- }
- }`;
- const codeSnippet = `
- let outputIndices = ${output.offsetToIndices('global_idx')};
- let batch = ${output.indicesGet('outputIndices', 0)};
- let d1 = ${output.indicesGet('outputIndices', channelDim)};
- let r = ${output.indicesGet('outputIndices', rowDim)};
- let c = ${output.indicesGet('outputIndices', colDim)};
- let dyCorner = vec2(i32(r), i32(c)) - uniforms.pads;
- let dyRCorner = dyCorner.x;
- let dyCCorner = dyCorner.y;
- let groupId = d1 / uniforms.output_channels_per_group;
- let wOutChannel = d1 - groupId * uniforms.output_channels_per_group;
- // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1).
- // ? = to be determined. : = across all values in that axis.
- var dotProd = ${dataType}(0.0);
- for (var wR: u32 = 0; wR < uniforms.effective_filter_dims.x; wR = wR + 1) {
- if (wR % uniforms.dilations.x != 0) {
- continue;
- }
- let dyR = (${dataType}(dyRCorner) + ${dataType}(wR)) / ${dataType}(uniforms.strides[0]);
- let wRPerm = uniforms.filter_dims.x - 1 - wR / uniforms.dilations.x;
- if (dyR < 0.0 || dyR >= ${dataType}(uniforms.Dy_shape[${rowDim}]) || fract(dyR) > 0.0 ||
- wRPerm < 0) {
- continue;
- }
- let idyR: u32 = u32(dyR);
-
- for (var wC: u32 = 0; wC < uniforms.effective_filter_dims.y; wC = wC + 1) {
- if (wC % uniforms.dilations.y != 0) {
- continue;
- }
- let dyC = (${dataType}(dyCCorner) + ${dataType}(wC)) / ${dataType}(uniforms.strides.y);
- let wCPerm = uniforms.filter_dims.y - 1 - wC / uniforms.dilations.y;
- if (dyC < 0.0 || dyC >= ${dataType}(uniforms.Dy_shape[${colDim}]) ||
- fract(dyC) > 0.0 || wCPerm < 0) {
- continue;
- }
- let idyC: u32 = u32(dyC);
- var inputChannel = groupId * uniforms.input_channels_per_group;
- for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + 1) {
- let xValue = ${
- isChannelsLast
- ? dy.get('batch', 'idyR', 'idyC', 'inputChannel')
- : dy.get('batch', 'inputChannel', 'idyR', 'idyC')
- };
- let wValue = ${w.get('inputChannel', 'wOutChannel', 'u32(wRPerm)', 'u32(wCPerm)')};
- dotProd = dotProd + xValue * wValue;
- inputChannel = inputChannel + 1;
- }
- }
- }
- let value = dotProd + ${hasBias ? 'bias[d1]' : `${dataType}(0.0)`};
- ${output.setByOffset('global_idx', 'value')};
- `;
-
- return `
- ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)}
- ${declareFunctions}
-
- ${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')};
- ${isVec4 ? codeSnippet4 : codeSnippet}}`;
-};
-
export const createConvTranspose2DProgramInfo = (
inputs: readonly TensorView[],
attributes: ConvTransposeAttributes,
squeezeOutputShapeFunction?: (shape: readonly number[]) => number[],
): ProgramInfo => {
const hasBias = inputs.length > 2;
- // const isChannelsLast = attributes.format === 'NHWC';
const outputShape = attributes.outputShape;
- const outputSize = ShapeUtil.size(outputShape);
-
- // const inChannels = inputs[0].dims[isChannelsLast ? 3 : 1];
- // TODO Enable isVec4 for performance
- // Disabled due to weight matrix layout issue
- // const isVec4 = attributes.group === 1 && isChannelsLast && inChannels % 4 === 0 && outChannels % 4 === 0;
+ const isChannelsLast = attributes.format === 'NHWC';
+ const group = attributes.group;
+ const wShape = inputs[1].dims;
+ const inputChannelsPerGroup = wShape[2] / group;
+ const outputChannelsPerGroup = wShape[3];
+ const components = isChannelsLast ? getMaxComponents(outputChannelsPerGroup) : 1;
+ const outputSize = ShapeUtil.size(outputShape) / components;
const dispatch = [Math.ceil(outputSize / 64), 1, 1];
LOG_DEBUG('verbose', () => `[conv2d_backprop_webgpu] dispatch = ${dispatch}`);
- const isChannelsLast = attributes.format === 'NHWC';
const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'rank'];
const strides = [attributes.strides[0], attributes.strides[1]];
const filterDims = [attributes.kernelShape[isChannelsLast ? 1 : 2], attributes.kernelShape[isChannelsLast ? 2 : 3]];
@@ -268,15 +66,9 @@ export const createConvTranspose2DProgramInfo = (
];
const pads = [
effectiveFilterDims[0] - 1 - Math.floor((attributes.pads[0] + attributes.pads[2]) / 2),
- effectiveFilterDims[1] - 1 - Math.floor(attributes.pads[1] + attributes.pads[3]) / 2,
+ effectiveFilterDims[1] - 1 - Math.floor((attributes.pads[1] + attributes.pads[3]) / 2),
];
- const isVec4 = false;
- const group = attributes.group;
- const wShape = inputs[1].dims;
- const inputChannelsPerGroup = wShape[0] / group;
- const outputChannelsPerGroup = wShape[1];
-
const programUniforms: ProgramUniform[] = [
{ type: DataType.uint32, data: outputSize },
{ type: DataType.uint32, data: strides },
@@ -294,7 +86,6 @@ export const createConvTranspose2DProgramInfo = (
}
programUniforms.push(...createTensorShapeVariables(outputShape));
- const is1DimensionDispatch = dispatch[1] === 1 && dispatch[2] === 1;
const getShaderSource = (shaderHelper: ShaderHelper) => {
const uniforms: UniformsArrayType = [
{ name: 'output_size', type: 'u32' },
@@ -307,21 +98,83 @@ export const createConvTranspose2DProgramInfo = (
{ name: 'output_channels_per_group', type: 'u32' },
];
const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
- return `${createConvTranspose2DOpProgramShaderSource(
- shaderHelper,
- inputs,
- outputShape,
- hasBias,
- is1DimensionDispatch,
- isVec4,
- dataType,
- uniforms,
- isChannelsLast,
- )}`;
+ const rowDim = isChannelsLast ? 1 : 2;
+ const colDim = isChannelsLast ? 2 : 3;
+ const channelDim = isChannelsLast ? 3 : 1;
+
+ const w = inputVariable('W', inputs[1].dataType, inputs[1].dims.length, components);
+ const dy = inputVariable('Dy', inputs[0].dataType, inputs[0].dims.length);
+ const inputVariables = [dy, w];
+ if (hasBias) {
+ inputVariables.push(inputVariable('bias', inputs[2].dataType, [outputShape[channelDim]].length, components));
+ }
+ const output = outputVariable('result', inputs[0].dataType, outputShape.length, components);
+
+ const codeSnippet = `
+ let outputIndices = ${output.offsetToIndices(`global_idx * ${components}`)};
+ let batch = ${output.indicesGet('outputIndices', 0)};
+ let d1 = ${output.indicesGet('outputIndices', channelDim)};
+ let r = ${output.indicesGet('outputIndices', rowDim)};
+ let c = ${output.indicesGet('outputIndices', colDim)};
+ let dyCorner = vec2(i32(r), i32(c)) - uniforms.pads;
+ let dyRCorner = dyCorner.x;
+ let dyCCorner = dyCorner.y;
+ let groupId = d1 / uniforms.output_channels_per_group;
+ let wOutChannel = d1 - groupId * uniforms.output_channels_per_group;
+ // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1).
+ // ? = to be determined. : = across all values in that axis.
+ var dotProd = ${output.type.value}(0.0);
+ for (var wR: u32 = 0; wR < uniforms.effective_filter_dims.x; wR = wR + 1) {
+ if (wR % uniforms.dilations.x != 0) {
+ continue;
+ }
+ let dyR = (${dataType}(dyRCorner) + ${dataType}(wR)) / ${dataType}(uniforms.strides[0]);
+ let wRPerm = uniforms.filter_dims.x - 1 - wR / uniforms.dilations.x;
+ if (dyR < 0.0 || dyR >= ${dataType}(uniforms.Dy_shape[${rowDim}]) || fract(dyR) > 0.0 ||
+ wRPerm < 0) {
+ continue;
+ }
+ let idyR: u32 = u32(dyR);
+
+ for (var wC: u32 = 0; wC < uniforms.effective_filter_dims.y; wC = wC + 1) {
+ if (wC % uniforms.dilations.y != 0) {
+ continue;
+ }
+ let dyC = (${dataType}(dyCCorner) + ${dataType}(wC)) / ${dataType}(uniforms.strides.y);
+ let wCPerm = uniforms.filter_dims.y - 1 - wC / uniforms.dilations.y;
+ if (dyC < 0.0 || dyC >= ${dataType}(uniforms.Dy_shape[${colDim}]) ||
+ fract(dyC) > 0.0 || wCPerm < 0) {
+ continue;
+ }
+ let idyC: u32 = u32(dyC);
+ var inputChannel = groupId * uniforms.input_channels_per_group;
+ for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + 1) {
+ let xValue = ${
+ isChannelsLast
+ ? dy.get('batch', 'idyR', 'idyC', 'inputChannel')
+ : dy.get('batch', 'inputChannel', 'idyR', 'idyC')
+ };
+ let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)};
+ let wValue = ${w.getByOffset(`w_offset / ${components}`)};
+ dotProd = dotProd + xValue * wValue;
+ inputChannel = inputChannel + 1;
+ }
+ }
+ }
+ let value = dotProd${hasBias ? ` + bias[d1 / ${components}]` : ''};
+ ${output.setByOffset('global_idx', 'value')};
+ `;
+
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')};
+ ${codeSnippet}}`;
};
+
return {
name: 'ConvTranspose2D',
- shaderCache: { hint: `${attributes.cacheKey};`, inputDependencies },
+ shaderCache: { hint: `${attributes.cacheKey};${components}`, inputDependencies },
getRunData: () => ({
dispatchGroup: { x: dispatch[0], y: dispatch[1], z: dispatch[2] },
outputs: [
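One small but consequential change in this file is the pads computation: the old expression applied Math.floor to the pad sum before dividing, so odd totals produced fractional pads. A standalone arithmetic check of the corrected form (values are made up for illustration):

const padBegin = 1;
const padEnd = 2;
const effectiveFilterDim = 5;
// old: Math.floor(padBegin + padEnd) / 2   -> 1.5 (fractional, wrong)
// new: Math.floor((padBegin + padEnd) / 2) -> 1   (integral, as intended)
const pad = effectiveFilterDim - 1 - Math.floor((padBegin + padEnd) / 2); // 3
console.log(pad);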
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts
index f0287529ca08b..c6341f94cf191 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts
@@ -25,7 +25,6 @@ import { ShapeUtil } from '../../../util';
import { ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform } from '../../types';
import {
createTensorShapeVariables,
- getBroadcastDims,
IndicesHelper,
inputVariable,
internalVariable,
@@ -40,6 +39,7 @@ import {
getActivationSnippet,
InternalActivationAttributes,
} from '../fuse-utils';
+import { convertOutputBatchIndicesToInputBatchIndices } from '../matmul-shaders';
import { typeSnippet } from './activation_util';
@@ -373,42 +373,11 @@ const matMulReadWriteFnSource = (
hasBias: boolean,
applyActivation: string,
variables: IndicesHelper[],
- batchShapes: Array<readonly number[]>,
isChannelsLast = false,
): string => {
- const [batchAShape, batchBShape, batchShape] = batchShapes;
const [batchVariable, aVariable, bVariable, outputVariable] = variables;
- const broadCastADims = getBroadcastDims(batchAShape, batchShape);
- const broadCastBDims = getBroadcastDims(batchBShape, batchShape);
const dataType = tensorTypeToWsglStorageType(variables[0].type.tensor);
- const getAIndices = () => {
- const aRank = aVariable.rank;
- const batchRank = batchVariable.rank;
- let resStr = `var aIndices: ${aVariable.type.indices};`;
- for (let i = aRank - 2 - 1, j = batchRank - 1; i >= 0; i--, j--) {
- resStr += `\naIndices[${i}] = ${batchRank > 1 ? `batchIndices[${j}]` : 'batchIndices'};`;
- }
- broadCastADims.forEach((i) => {
- resStr += `\naIndices[${i}] = 0;`;
- });
- resStr += `\naIndices[${aRank - 2}] = u32(row);
- aIndices[${aRank - 1}] = u32(colIn);`;
- return resStr;
- };
- const getBIndices = () => {
- const bRank = bVariable.rank;
- const batchRank = batchVariable.rank;
- let resStr = `var bIndices: ${bVariable.type.indices};`;
- for (let i = bRank - 2 - 1, j = batchRank - 1; i >= 0; i--, j--) {
- resStr += `\nbIndices[${i}] = ${batchRank > 1 ? `batchIndices[${j}]` : 'batchIndices'};`;
- }
- broadCastBDims.forEach((i) => {
- resStr += `\nbIndices[${i}] = 0;`;
- });
- resStr += `\nbIndices[${bRank - 2}] = u32(row);
- bIndices[${bRank - 1}] = u32(colIn);`;
- return resStr;
- };
+
const source = `
fn mm_readA(batch: i32, row: i32, colIn: i32, batchIndices: ${batchVariable.type.indices}) -> ${typeSnippet(
component,
@@ -418,7 +387,16 @@ const matMulReadWriteFnSource = (
let col = colIn * ${component};
if(row < uniforms.dim_a_outer && col < uniforms.dim_inner)
{
- ${getAIndices()}
+ var aIndices: ${aVariable.type.indices};
+ ${convertOutputBatchIndicesToInputBatchIndices(
+ 'aIndices',
+ aVariable,
+ aVariable.rank - 2,
+ batchVariable.rank,
+ 'batchIndices',
+ )}
+ ${aVariable.indicesSet('aIndices', aVariable.rank - 2, 'u32(row)')}
+ ${aVariable.indicesSet('aIndices', aVariable.rank - 1, 'u32(colIn)')}
value = ${aVariable.getByIndices('aIndices')};
}
return value;
@@ -432,7 +410,16 @@ const matMulReadWriteFnSource = (
let col = colIn * ${component};
if(row < uniforms.dim_inner && col < uniforms.dim_b_outer)
{
- ${getBIndices()}
+ var bIndices: ${bVariable.type.indices};
+ ${convertOutputBatchIndicesToInputBatchIndices(
+ 'bIndices',
+ bVariable,
+ bVariable.rank - 2,
+ batchVariable.rank,
+ 'batchIndices',
+ )}
+ ${bVariable.indicesSet('bIndices', bVariable.rank - 2, 'u32(row)')}
+ ${bVariable.indicesSet('bIndices', bVariable.rank - 1, 'u32(colIn)')}
value = ${bVariable.getByIndices('bIndices')};
}
return value;
@@ -532,7 +519,6 @@ export const createMatmulProgramInfo = (
hasBias,
applyActivation,
[batchDims, A, B, output],
- [outerDimsA, outerDimsB, outerDims],
isChannelsLast,
);
return `
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
index 793f26fe901e3..0b9173403cd7d 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
@@ -195,7 +195,7 @@ export interface IndicesHelper {
/**
* whether the helper is for an input, an output or an internal variable.
*/
- readonly usage: 'input' | 'output' | 'internal';
+ readonly usage: 'input' | 'output' | 'atomicOutput' | 'internal';
/**
* the rank of the input or output.
@@ -733,6 +733,20 @@ export const outputVariable = (
components: 1 | 2 | 3 | 4 = 1,
): IndicesHelper => createIndicesHelper(name, type, shapeOrRank, 'output', components);
+/**
+ * Create a IndicesHelper for an atomic output.
+ *
+ * @param name - the name of the output.
+ * @param type - the tensor type of the output.
+ * @param shapeOrRank - the tensor shape or the rank of the output.
+ * @returns an IndicesHelper for the output.
+ */
+export const atomicOutputVariable = (
+ name: string,
+ type: number,
+ shapeOrRank: number | readonly number[],
+): IndicesHelper => createIndicesHelper(name, type, shapeOrRank, 'atomicOutput', 1);
+
/**
* Create a IndicesHelper for an internal variable.
*
@@ -905,9 +919,8 @@ class ShaderHelperImpl implements ShaderHelper {
}
this.variables.push(variable);
this.appendVariableUniforms(variable);
-
const access = variable.usage === 'input' ? 'read' : 'read_write';
- const storageType = variable.type.storage;
+ const storageType = variable.usage === 'atomicOutput' ? `atomic<i32>` : variable.type.storage;
return `@group(0) @binding(${bindingIndex}) var<storage, ${access}> ${variable.name}: array<${storageType}>;`;
}
@@ -996,27 +1009,3 @@ class ShaderHelperImpl implements ShaderHelper {
export const createShaderHelper = (dispatchGroup: [number, number, number], limits: GPUSupportedLimits) =>
new ShaderHelperImpl(dispatchGroup, limits);
-
-/**
- * This function comes from https://github.com/tensorflow/tfjs/blob/master/tfjs-core/src/ops/broadcast_util.ts#L18-L40
- * Returns the dimensions in the input shape that are broadcasted to
- * produce the provided output shape.
- *
- * The returned dimensions are 0-indexed and sorted. An example:
- * inShape = [4, 1, 3]
- * outShape = [5, 4, 3, 3]
- * result = [1]. Dimension 1 (2nd dimension of input) gets broadcasted 1 => 3.
- */
-export const getBroadcastDims = (inShape: readonly number[], outShape: readonly number[]): number[] => {
- const inRank = inShape.length;
- const dims: number[] = [];
- for (let i = 0; i < inRank; i++) {
- const dim = inRank - 1 - i;
- const a = inShape[dim] || 1;
- const b = outShape[outShape.length - 1 - i] || 1;
- if (b > 1 && a === 1) {
- dims.unshift(dim);
- }
- }
- return dims;
-};
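The removed getBroadcastDims helper is no longer needed here because matMulReadWriteFnSource now delegates batch-index conversion to convertOutputBatchIndicesToInputBatchIndices from matmul-shaders. For reference, a standalone copy of the removed logic reproduces the example from its docstring:

const getBroadcastDimsExample = (inShape: readonly number[], outShape: readonly number[]): number[] => {
  const dims: number[] = [];
  for (let i = 0; i < inShape.length; i++) {
    const dim = inShape.length - 1 - i;
    const a = inShape[dim] || 1;
    const b = outShape[outShape.length - 1 - i] || 1;
    if (b > 1 && a === 1) {
      dims.unshift(dim); // this input dim is broadcast to match the output
    }
  }
  return dims;
};
console.log(getBroadcastDimsExample([4, 1, 3], [5, 4, 3, 3])); // [1]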
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
index 236f1b09a6c93..3e168ddedac86 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
@@ -4,7 +4,6 @@
import { TensorView } from '../../tensor-view';
import { ComputeContext } from '../types';
-import { createConv2DTransposeMatMulProgramInfo } from './3rd-party/conv_backprop_mm_webgpu';
import { createConvTranspose2DProgramInfo } from './3rd-party/conv_backprop_webgpu';
import { ConvAttributes } from './conv';
import { parseInternalActivationAttributes } from './fuse-utils';
@@ -227,41 +226,16 @@ const validateInputs = (inputs: readonly TensorView[], attributes: ConvTranspose
}
};
-// for transposing weight tensor from [C, M/group, KH, KW] to [KH, KW, M/group, C]
-const weightTransposePerm = [2, 3, 1, 0];
-
const convTranspose2d = (
context: ComputeContext,
inputs: readonly TensorView[],
attributes: ConvTransposeAttributes,
+ squeezeOutputShapeFunction?: (shape: readonly number[]) => number[],
): void => {
- const adjustedAttributes = getAdjustedConvTransposeAttributes(attributes, inputs);
- const isChannelsLast = attributes.format === 'NHWC';
- const outputShape = adjustedAttributes.outputShape;
- const outChannels = outputShape[isChannelsLast ? 3 : 1];
- const inputChannels = inputs[0].dims[isChannelsLast ? 3 : 1];
- // Switch to naive method when outChannels and inputChannels are very small. It's because that in this case it's
- // not suitable for matmul version since matmul uses tile size 32x32 resulting the underlying execution unit
- // utilization rate is very low.
- if (adjustedAttributes.group !== 1 || (outChannels === 1 && inputChannels === 1)) {
- context.compute(createConvTranspose2DProgramInfo(inputs, adjustedAttributes));
- return;
- }
- const outHeight = outputShape[isChannelsLast ? 1 : 2];
- const outWidth = outputShape[isChannelsLast ? 2 : 3];
- const weightHeight = inputs[1].dims[2];
- const weightWidth = inputs[1].dims[3];
-
- const dimAOuter = isChannelsLast ? outHeight * outWidth : outChannels;
- const dimBOuter = isChannelsLast ? outChannels : outHeight * outWidth;
- const dimInner = weightHeight * weightWidth * inputChannels;
-
- const sequentialAccessByThreads = /* backend.adapterInfo.isIntel() */ true;
-
// STEP.1: transpose weight
const transposedWeight =
(context.kernelCustomData.wT as TensorView | undefined) ??
- context.compute(createTransposeProgramInfo(inputs[1], weightTransposePerm), {
+ context.compute(createTransposeProgramInfo(inputs[1], [2, 3, 0, 1]), {
inputs: [1],
outputs: [attributes.wIsConst ? -2 : -1],
})[0];
@@ -271,29 +245,12 @@ const convTranspose2d = (
// STEP.2: prepare reshaped inputs
const convTransposeInputs = [inputs[0], transposedWeight];
- const hasBias = inputs.length === 3;
- if (hasBias) {
- if (!isChannelsLast && inputs[2].dims.length === 1) {
- convTransposeInputs.push(inputs[2].reshape([inputs[2].dims[0], 1, 1]));
- } else {
- convTransposeInputs.push(inputs[2]);
- }
+ if (inputs.length === 3) {
+ convTransposeInputs.push(inputs[2]);
}
-
- // STEP.3: compute matmul
- context.compute(
- createConv2DTransposeMatMulProgramInfo(
- convTransposeInputs,
- adjustedAttributes,
- outputShape,
- dimAOuter,
- dimBOuter,
- dimInner,
- hasBias,
- sequentialAccessByThreads,
- ),
- { inputs: convTransposeInputs },
- );
+ context.compute(createConvTranspose2DProgramInfo(convTransposeInputs, attributes, squeezeOutputShapeFunction), {
+ inputs: convTransposeInputs,
+ });
};
const convTranspose1d = (context: ComputeContext, attributes: ConvTransposeAttributes): void => {
@@ -338,12 +295,9 @@ const convTranspose1d = (context: ComputeContext, attributes: ConvTransposeAttri
{ ...attributes, pads, strides, dilations, kernelShape },
inputs,
);
- context.compute(
- createConvTranspose2DProgramInfo(inputs, adjustedAttributes, (outputShape) =>
- isChannelLast
- ? [outputShape[0], outputShape[2], outputShape[3]]
- : [outputShape[0], outputShape[1], outputShape[3]],
- ),
+
+ convTranspose2d(context, inputs, adjustedAttributes, (outputShape) =>
+ isChannelLast ? [outputShape[0], outputShape[2], outputShape[3]] : [outputShape[0], outputShape[1], outputShape[3]],
);
};
@@ -352,6 +306,7 @@ export const convTranspose = (context: ComputeContext, attributes: ConvTranspose
if (context.inputs[0].dims.length === 3) {
convTranspose1d(context, attributes);
} else {
- convTranspose2d(context, context.inputs, attributes);
+ const adjustedAttributes = getAdjustedConvTransposeAttributes(attributes, context.inputs);
+ convTranspose2d(context, context.inputs, adjustedAttributes);
}
};
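The weight transpose permutation changes from [2, 3, 1, 0] to [2, 3, 0, 1], so the ConvTranspose weight [C, M/group, KH, KW] is now laid out as [KH, KW, C, M/group], which matches the (wRPerm, wCPerm, inputChannel, wOutChannel) access order in the rewritten shader and the wShape[2]/wShape[3] reads in createConvTranspose2DProgramInfo. A quick shape check (example dims assumed):

const permute = (shape: readonly number[], perm: readonly number[]): number[] => perm.map((p) => shape[p]);
console.log(permute([8, 4, 3, 3], [2, 3, 0, 1])); // [3, 3, 8, 4] = [KH, KW, C, M/group]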
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts
index 4e2bfa9d89924..3691b5ecb602b 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts
@@ -48,11 +48,18 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo =>
const shape = Array.from(inputs[1].getBigInt64Array(), Number);
const outputShape: number[] = calculateOutputShape(inputShape, shape);
const dataType = inputs[0].dataType;
- const components = dataType === DataType.bool ? 4 : 1;
+ const isBoolOrScalar = dataType === DataType.bool || ShapeUtil.size(inputShape) === 1;
+ const iComponents =
+ dataType === DataType.bool ? 4 : inputShape.length > 0 && inputShape[inputShape.length - 1] % 4 === 0 ? 4 : 1;
+ const components = isBoolOrScalar
+ ? 4
+ : outputShape.length > 0 && outputShape[outputShape.length - 1] % 4 === 0
+ ? 4
+ : 1;
const outputSize = Math.ceil(ShapeUtil.size(outputShape) / components);
const getShaderSource = (shaderHelper: ShaderHelper) => {
- const input = inputVariable('input', dataType, inputShape.length, components);
+ const input = inputVariable('input', dataType, inputShape.length, iComponents);
const output = outputVariable('output', dataType, outputShape.length, components);
let assignment: string;
if (dataType === DataType.bool) {
@@ -74,9 +81,10 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo =>
}`;
} else {
assignment = `
- let outputIndices = ${output.offsetToIndices('global_idx')};
+ let outputIndices = ${output.offsetToIndices(`global_idx * ${components}`)};
let inputOffset = ${input.broadcastedIndicesToOffset('outputIndices', output)};
- ${output.setByOffset('global_idx', input.getByOffset('inputOffset'))}
+ let data = ${output.type.value}(${input.getByOffset(`inputOffset / ${iComponents}`)});
+ ${output.setByOffset('global_idx', 'data')}
}`;
}
return `
@@ -92,7 +100,7 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo =>
];
return {
name: 'Expand',
- shaderCache: { hint: `${outputShape.length}`, inputDependencies: ['rank'] },
+ shaderCache: { hint: `${outputShape.length};${iComponents}${components}`, inputDependencies: ['rank'] },
getShaderSource,
getRunData: () => ({
outputs: [{ dims: outputShape, dataType: inputs[0].dataType }],
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather-nd.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather-nd.ts
new file mode 100644
index 0000000000000..43b51f6e94a66
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/gather-nd.ts
@@ -0,0 +1,179 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import { DataType } from '../../../wasm-common';
+import { TensorView } from '../../tensor-view';
+import { ShapeUtil } from '../../util';
+import { AttributeWithCacheKey } from '../attribute-with-cache-key';
+import { ComputeContext, ProgramUniform } from '../types';
+
+import { createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, UniformsArrayType } from './common';
+
+export interface GatherNDAttributes extends AttributeWithCacheKey {
+ readonly batchDims: number;
+}
+
+const computeSliceOffsets = (
+ context: ComputeContext,
+ indicesData: TensorView,
+ sizesFromSliceDimsData: number[],
+ batchDims: number,
+ inputDims: readonly number[],
+ numSlices: number,
+ numSlicesPerBatch: number,
+ inputBatchStride: number,
+ numSliceDims: number,
+) => {
+ const programUniforms: ProgramUniform[] = [
+ { type: DataType.uint32, data: numSlices },
+ { type: DataType.uint32, data: batchDims },
+ { type: DataType.uint32, data: inputDims },
+ { type: DataType.uint32, data: sizesFromSliceDimsData },
+ { type: DataType.uint32, data: numSlicesPerBatch },
+ { type: DataType.uint32, data: inputBatchStride },
+ { type: DataType.uint32, data: numSliceDims },
+ ];
+
+ const outputShape = [numSlices];
+ programUniforms.push(...createTensorShapeVariables(indicesData.dims, outputShape));
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const indices = inputVariable('indices_data', indicesData.dataType, indicesData.dims.length);
+ const output = outputVariable('input_slice_offsets_data', DataType.uint32, 1, 1);
+ const variables = [indices, output];
+ const uniforms: UniformsArrayType = [
+ { name: 'output_size', type: 'u32' },
+ { name: 'batch_dims', type: 'u32' },
+ { name: 'input_dims', type: 'u32', length: inputDims.length },
+ { name: 'sizes_from_slice_dims_data', type: 'u32', length: sizesFromSliceDimsData.length },
+ { name: 'num_slices_per_batch', type: 'u32' },
+ { name: 'input_batch_stride', type: 'u32' },
+ { name: 'num_slice_dims', type: 'u32' },
+ ];
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ let batch_idx = global_idx / uniforms.num_slices_per_batch;
+ let base_offset = batch_idx * uniforms.input_batch_stride;
+
+ let slice_indices_base_offset = global_idx * uniforms.num_slice_dims;
+ var relative_slice_offset = 0;
+ for (var dim_idx = 0u; dim_idx < uniforms.num_slice_dims; dim_idx ++) {
+ var index = i32(indices_data[dim_idx + slice_indices_base_offset].x);
+ let input_dim_idx = uniforms.batch_dims + dim_idx;
+ if (index < 0) {
+ ${
+ inputDims.length === 1
+ ? 'index += i32(uniforms.input_dims);'
+ : 'index += i32(uniforms.input_dims[input_dim_idx]);'
+ }
+ }
+ ${
+ sizesFromSliceDimsData.length === 1
+ ? 'relative_slice_offset += index * i32(uniforms.sizes_from_slice_dims_data);'
+ : 'relative_slice_offset += index * i32(uniforms.sizes_from_slice_dims_data[dim_idx]);'
+ }
+ }
+
+ input_slice_offsets_data[global_idx] = base_offset + u32(relative_slice_offset);
+ }`;
+ };
+
+ return context.compute(
+ {
+ name: 'computeSliceOffsets',
+ shaderCache: { hint: `${inputDims.length}_${sizesFromSliceDimsData.length}`, inputDependencies: ['rank'] },
+ getRunData: () => ({
+ outputs: [{ dims: outputShape, dataType: context.inputs[1].dataType }],
+ dispatchGroup: { x: Math.ceil(numSlices / 64) },
+ programUniforms,
+ }),
+ getShaderSource,
+ },
+ { inputs: [indicesData], outputs: [-1] },
+ )[0];
+};
+
+export const gatherND = (context: ComputeContext, attributes: GatherNDAttributes) => {
+ const inputs = context.inputs;
+ const inputShape = inputs[0].dims;
+ const inputType = inputs[0].dataType;
+ const indicesShape = inputs[1].dims;
+ const numSliceDims = indicesShape[indicesShape.length - 1];
+ const numSlices = ShapeUtil.sizeToDimension(indicesShape, indicesShape.length - 1);
+ const sliceSize = ShapeUtil.sizeFromDimension(inputShape, attributes.batchDims + numSliceDims);
+ const numBatches = ShapeUtil.sizeToDimension(inputShape, attributes.batchDims);
+ const inputBatchStride = ShapeUtil.sizeFromDimension(inputShape, attributes.batchDims);
+ const numSlicesPerBatch = numSlices / numBatches;
+ const sizesFromSliceDims = new Array(numSliceDims);
+ let runningProduct = sliceSize;
+ for (let i = 0; i < numSliceDims; ++i) {
+ sizesFromSliceDims[numSliceDims - 1 - i] = runningProduct;
+ runningProduct *= inputShape[attributes.batchDims + numSliceDims - 1 - i];
+ }
+
+ const inputSliceOffsets = computeSliceOffsets(
+ context,
+ inputs[1],
+ sizesFromSliceDims,
+ attributes.batchDims,
+ inputShape,
+ numSlices,
+ numSlicesPerBatch,
+ inputBatchStride,
+ numSliceDims,
+ );
+
+ const lastIndicesDimension = attributes.batchDims + numSliceDims;
+ if (lastIndicesDimension > inputShape.length) {
+ throw new Error('last dimension of indices must not be larger than rank of input tensor');
+ }
+
+ const outputShape = indicesShape.slice(0, -1).concat(inputShape.slice(lastIndicesDimension));
+ const outputSize = ShapeUtil.size(outputShape);
+
+ const programUniforms: ProgramUniform[] = [
+ { type: DataType.uint32, data: outputSize },
+ { type: DataType.uint32, data: sliceSize },
+ ...createTensorShapeVariables(inputs[0].dims, inputSliceOffsets.dims, outputShape),
+ ];
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const input = inputVariable('data', inputs[0].dataType, inputs[0].dims.length);
+ const indices = inputVariable('slice_offsets', DataType.uint32, inputSliceOffsets.dims.length);
+
+ const output = outputVariable('output', inputs[0].dataType, outputShape.length);
+ return `
+ ${shaderHelper
+ .registerUniform('output_size', 'u32')
+ .registerUniform('slice_size', 'u32')
+ .declareVariables(input, indices, output)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ let slice_offset = slice_offsets[global_idx / uniforms.slice_size];
+ output[global_idx] = data[u32(slice_offset) + global_idx % uniforms.slice_size];
+ }`;
+ };
+ context.compute(
+ {
+ name: 'GatherND',
+ shaderCache: { hint: attributes.cacheKey, inputDependencies: ['rank', 'rank'] },
+ getRunData: () => ({
+ outputs: [{ dims: outputShape, dataType: inputType }],
+ dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) },
+ programUniforms,
+ }),
+ getShaderSource,
+ },
+ { inputs: [inputs[0], inputSliceOffsets] },
+ );
+};
+
+export const parseGatherNDAttributes = (attributes: Record<string, unknown>): GatherNDAttributes => {
+ const batchDims = attributes.batch_dims as number;
+ return {
+ batchDims,
+ cacheKey: '',
+ };
+};
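The sizesFromSliceDims loop above precomputes, for each slice dimension, how many elements one index step spans. A worked example with assumed shapes (input [2, 3, 4], indices [5, 2], batchDims = 0):

const inputShape = [2, 3, 4];
const batchDims = 0;
const numSliceDims = 2; // last dimension of the indices shape [5, 2]
const sliceSize = 4;    // product of inputShape[batchDims + numSliceDims ..]
const sizesFromSliceDims = new Array<number>(numSliceDims);
let runningProduct = sliceSize;
for (let i = 0; i < numSliceDims; ++i) {
  sizesFromSliceDims[numSliceDims - 1 - i] = runningProduct;
  runningProduct *= inputShape[batchDims + numSliceDims - 1 - i];
}
console.log(sizesFromSliceDims); // [12, 4]: an index (i, j) selects the slice starting at offset i * 12 + j * 4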
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/grid-sample.ts b/js/web/lib/wasm/jsep/webgpu/ops/grid-sample.ts
new file mode 100644
index 0000000000000..50c71472434ad
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/grid-sample.ts
@@ -0,0 +1,279 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import { DataType } from '../../../wasm-common';
+import { TensorView } from '../../tensor-view';
+import { ShapeUtil } from '../../util';
+import { AttributeWithCacheKey, createAttributeWithCacheKey } from '../attribute-with-cache-key';
+import { ComputeContext, ProgramInfo, ProgramUniform } from '../types';
+
+import { createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper } from './common';
+
+let [idxN, idxC, idxH, idxW] = [0, 1, 2, 3]; // NCHW
+type Mode = 'bilinear' | 'nearest' | 'bicubic';
+type PaddingMode = 'zeros' | 'border' | 'reflection';
+type Format = 'NHWC' | 'NCHW';
+export interface GridSampeAttributes extends AttributeWithCacheKey {
+ alignCorners: number;
+ mode: Mode;
+ paddingMode: PaddingMode;
+ format: Format;
+}
+
+const validateInputs = (inputs: readonly TensorView[]): void => {
+ if (inputs[0].dims.length !== 4) {
+ throw new Error('only 4-D tensor is supported.');
+ }
+ if (inputs[0].dims.length !== inputs[1].dims.length) {
+ throw new Error('input dimensions must be equal to grid dimensions');
+ }
+
+ if (inputs[0].dims.length - 2 !== inputs[1].dims[inputs[1].dims.length - 1]) {
+ throw new Error(`last dimension of grid must be equal to ${inputs[0].dims.length - 2}`);
+ }
+
+ if (inputs[0].dims[0] !== inputs[1].dims[0]) {
+ throw new Error('grid batch size must match input batch size');
+ }
+};
+
+const gsGetCubicCoeffs = `
+ fn gs_get_cubic_coeffs(x: f32) -> vec4<f32> {
+ let cubic_alpha = -0.75f;
+ let x_abs = abs(x);
+ var coeffs: vec4<f32>;
+ coeffs[0] = (((cubic_alpha * (x_abs + 1) - 5 * cubic_alpha) * (x_abs + 1) + 8 * cubic_alpha) * (x_abs + 1) - 4 * cubic_alpha);
+ coeffs[1] = (((cubic_alpha + 2) * x_abs - (cubic_alpha + 3)) * x_abs * x_abs + 1);
+ coeffs[2] = (((cubic_alpha + 2) * (1 - x_abs) - (cubic_alpha + 3)) * (1 - x_abs) * (1 - x_abs) + 1);
+ coeffs[3] = (((cubic_alpha * (2 - x_abs) - 5 * cubic_alpha) * (2 - x_abs) + 8 * cubic_alpha) * (2 - x_abs) - 4 * cubic_alpha);
+ return coeffs;
+ }
+`;
+
+const gsBicubicInterpolate = (dataType: string): string => `
+ fn gs_bicubic_interpolate(p: mat4x4<${dataType}>, x: f32, y: f32) -> ${dataType} {
+ var v: vec4<f32>;
+ var coeffs = gs_get_cubic_coeffs(x);
+ for (var i = 0; i < 4; i++) {
+ v[i] = coeffs[0] * p[i][0] + coeffs[1] * p[i][1] + coeffs[2] * p[i][2] + coeffs[3] * p[i][3];
+ }
+ coeffs = gs_get_cubic_coeffs(y);
+ let pixel = ${dataType}(coeffs[0] * v[0] + coeffs[1] * v[1] + coeffs[2] * v[2] + coeffs[3] * v[3]);
+ return pixel;
+ }
+`;
+
+const gsDenormalize = (attributes: GridSampeAttributes): string => `
+ fn gs_denormalize(n: f32, length: i32) -> f32 {
+ ${
+ attributes.alignCorners === 0
+ ? `
+ // alignCorners: false => [-1, 1] to [-0.5, length - 0.5]
+ return ((n + 1.0) * f32(length) - 1.0) / 2.0;
+ `
+ : `
+ // alignCorners: true => [-1, 1] to [0, length - 1]
+ return (n + 1.0) / 2.0 * (f32(length - 1));
+ `
+ }
+ }
+`;
+
+const gsReflect = (attributes: GridSampeAttributes): string => `
+ ${
+ attributes.paddingMode === 'reflection'
+ ? `
+ fn gs_reflect(x: i32, x_min: f32, x_max: f32) -> u32 {
+ var dx = 0.0;
+ var fx = f32(x);
+ let range = x_max - x_min;
+ if (fx < x_min) {
+ dx = x_min - fx;
+ let n = u32(dx / range);
+ let r = dx - f32(n) * range;
+ if (n % 2 == 0) {
+ fx = x_min + r;
+ } else {
+ fx = x_max - r;
+ }
+ } else if (fx > x_max) {
+ dx = fx - x_max;
+ let n = u32(dx / range);
+ let r = dx - f32(n) * range;
+ if (n % 2 == 0) {
+ fx = x_max - r;
+ } else {
+ fx = x_min + r;
+ }
+ }
+ return u32(fx);
+ }`
+ : ''
+ }
+`;
+
+const pixelAtGrid = (input: IndicesHelper, dataType: string, attributes: GridSampeAttributes): string =>
+ `
+ fn pixel_at_grid(r: i32, c: i32, H: i32, W: i32, batch: u32, channel: u32, border: vec4<f32>) -> ${dataType} {
+ var pixel = ${dataType}(0);
+ var indices = vec4<u32>(0);
+ indices[${idxN}] = batch;
+ indices[${idxC}] = channel;` +
+ (() => {
+ switch (attributes.paddingMode) {
+ case 'zeros':
+ return `
+ if (r >= 0 && r < H && c >=0 && c < W) {
+ indices[${idxH}] = u32(r);
+ indices[${idxW}] = u32(c);
+ }
+ `;
+ case 'border':
+ return `
+ indices[${idxH}] = u32(clamp(r, 0, H - 1));
+ indices[${idxW}] = u32(clamp(c, 0, W - 1));
+ `;
+ case 'reflection':
+ return `
+ indices[${idxH}] = gs_reflect(r, border[1], border[3]);
+ indices[${idxW}] = gs_reflect(c, border[0], border[2]);
+ `;
+ default:
+ throw new Error(`padding mode ${attributes.paddingMode} is not supported`);
+ }
+ })() +
+ `
+ return ${input.getByIndices('indices')};
+ }
+`;
+
+const computePixel = (output: IndicesHelper, dataType: string, attributes: GridSampeAttributes): string =>
+ (() => {
+ switch (attributes.mode) {
+ case 'nearest':
+ return `
+ let result = pixel_at_grid(i32(round(y)), i32(round(x)), H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+ `;
+ case 'bilinear':
+ return `
+ let x1 = i32(floor(x));
+ let y1 = i32(floor(y));
+ let x2 = x1 + 1;
+ let y2 = y1 + 1;
+
+ let p11 = pixel_at_grid(y1, x1, H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+ let p12 = pixel_at_grid(y1, x2, H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+ let p21 = pixel_at_grid(y2, x1, H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+ let p22 = pixel_at_grid(y2, x2, H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+
+ let dx2 = ${dataType}(f32(x2) - x);
+ let dx1 = ${dataType}(x - f32(x1));
+ let dy2 = ${dataType}(f32(y2) - y);
+ let dy1 = ${dataType}(y - f32(y1));
+ let result = dy2 * (dx2 * p11 + dx1 * p12) + dy1 * (dx2 * p21 + dx1 * p22);
+ `;
+ case 'bicubic':
+ return `
+ let x0 = i32(floor(x)) - 1;
+ let y0 = i32(floor(y)) - 1;
+ var p: mat4x4<${dataType}>;
+ for (var h = 0; h < 4; h++) {
+ for (var w = 0; w < 4; w++) {
+ p[h][w] = pixel_at_grid(h + y0, w + x0, H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+ }
+ }
+
+ let dx = x - f32(x0 + 1);
+ let dy = y - f32(y0 + 1);
+ let result = gs_bicubic_interpolate(p, dx, dy);
+ `;
+ default:
+ throw new Error(`mode ${attributes.mode} is not supported`);
+ }
+ })() + `${output.setByOffset('global_idx', 'result')}`;
+
+const createGridSampleProgramInfo = (inputs: readonly TensorView[], attributes: GridSampeAttributes): ProgramInfo => {
+ const x = inputVariable('x', inputs[0].dataType, inputs[0].dims.length);
+ // drop the last dimension so the grid data can be accessed as vec2
+ const gridShape = [inputs[1].dims[0], inputs[1].dims[1], inputs[1].dims[2]];
+ const grid = inputVariable('grid', inputs[1].dataType, gridShape.length, 2);
+ let outputShape = [inputs[0].dims[0], inputs[0].dims[1], inputs[1].dims[1], inputs[1].dims[2]];
+ if (attributes.format === 'NHWC') {
+ outputShape = [inputs[0].dims[0], inputs[1].dims[1], inputs[1].dims[2], inputs[0].dims[3]];
+ [idxN, idxC, idxH, idxW] = [0, 3, 1, 2];
+ }
+ const output = outputVariable('output', inputs[0].dataType, outputShape.length);
+ const dataType = x.type.value;
+ const outputSize = ShapeUtil.size(outputShape);
+
+ const programUniforms: ProgramUniform[] = [
+ { type: DataType.uint32, data: outputSize },
+ ...createTensorShapeVariables(inputs[0].dims, gridShape, outputShape),
+ ];
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => `
+ ${shaderHelper.registerUniform('output_size', 'u32').declareVariables(x, grid, output)}
+ ${gsGetCubicCoeffs}
+ ${gsBicubicInterpolate(dataType)}
+ ${gsDenormalize(attributes)}
+ ${gsReflect(attributes)}
+ ${pixelAtGrid(x, dataType, attributes)}
+
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ let H_in = i32(uniforms.x_shape[${idxH}]);
+ let W_in = i32(uniforms.x_shape[${idxW}]);
+
+ ${
+ attributes.alignCorners === 0
+ ? `
+ let x_min = -0.5;
+ let x_max = f32(W_in) - 0.5;
+ let y_min = -0.5;
+ let y_max = f32(H_in) - 0.5;
+ `
+ : `
+ let x_min = 0.0;
+ let x_max = f32(W_in) - 1.0;
+ let y_min = 0.0;
+ let y_max = f32(H_in) - 1.0;
+ `
+ };
+ let border = vec4(x_min, y_min, x_max, y_max);
+
+ let indices = ${output.offsetToIndices('global_idx')};
+ var grid_indices = vec3(indices[${idxN}], indices[${idxH}], indices[${idxW}]);
+ let nxy = ${grid.getByIndices('grid_indices')};
+ var x = gs_denormalize(f32(nxy[0]), W_in);
+ var y = gs_denormalize(f32(nxy[1]), H_in);
+
+ ${computePixel(output, dataType, attributes)}
+ }`;
+
+ return {
+ name: 'GridSample',
+ shaderCache: { hint: `${attributes.cacheKey}`, inputDependencies: ['type', 'type'] },
+ getRunData: (inputs) => {
+ const outputSize = ShapeUtil.size(outputShape);
+ return {
+ outputs: [{ dims: outputShape, dataType: inputs[0].dataType }],
+ dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) },
+ programUniforms,
+ };
+ },
+ getShaderSource,
+ };
+};
+
+export const gridSample = (context: ComputeContext, attributes: GridSampeAttributes): void => {
+ validateInputs(context.inputs);
+ context.compute(createGridSampleProgramInfo(context.inputs, attributes));
+};
+
+export const parseGridSampleAttributes = (attributes: Record<string, unknown>): GridSampeAttributes =>
+ createAttributeWithCacheKey({
+ alignCorners: attributes.align_corners as number,
+ mode: attributes.mode as Mode,
+ paddingMode: attributes.padding_mode as PaddingMode,
+ format: attributes.format as Format,
+ });
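The gs_denormalize function above maps normalized grid coordinates in [-1, 1] to pixel coordinates, with alignCorners deciding whether the endpoints land on pixel edges or pixel centres. A standalone numeric check for a width of 4 (illustrative only):

const denormalize = (n: number, length: number, alignCorners: boolean): number =>
  alignCorners ? ((n + 1) / 2) * (length - 1) : ((n + 1) * length - 1) / 2;
console.log(denormalize(-1, 4, false)); // -0.5: left edge of the first pixel
console.log(denormalize(1, 4, false));  //  3.5: right edge of the last pixel
console.log(denormalize(-1, 4, true));  //  0:   centre of the first pixel
console.log(denormalize(1, 4, true));   //  3:   centre of the last pixel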
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts
index e40cfa5200a08..327e972257d35 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts
@@ -42,6 +42,7 @@ export const matMul = (context: ComputeContext): void => {
const batchA = ShapeUtil.size(context.inputs[0].dims.slice(0, -2));
const batchB = ShapeUtil.size(context.inputs[1].dims.slice(0, -2));
if (batchA !== 1 && M === 1 && batchB === 1) {
+ // Optimization for batched vec-mat-mul
const reshapedA = context.inputs[0].reshape([1, batchA, K]);
const reshapedB = context.inputs[1].reshape([1, K, N]);
const matmulOutputShape = [1, batchA, N];
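The fast path above rewrites a stack of independent vector-matrix products as one matmul. With assumed shapes A = [12, 1, 256] (batchA = 12, M = 1, K = 256) and B = [256, 256] (batchB = 1, N = 256), the reshapes turn 12 separate (1 x 256) x (256 x 256) products into a single (12 x 256) x (256 x 256) product:

const batchA = 12, K = 256, N = 256;
const reshapedA = [1, batchA, K];         // [1, 12, 256]
const reshapedB = [1, K, N];              // [1, 256, 256]
const matmulOutputShape = [1, batchA, N]; // [1, 12, 256], viewed back as [12, 1, 256] afterwards
console.log(reshapedA, reshapedB, matmulOutputShape);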
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts b/js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts
new file mode 100644
index 0000000000000..8c24232d63c0c
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts
@@ -0,0 +1,177 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import { DataType } from '../../../wasm-common';
+import { TensorView } from '../../tensor-view';
+import { ShapeUtil } from '../../util';
+import { AttributeWithCacheKey, createAttributeWithCacheKey } from '../attribute-with-cache-key';
+import { ComputeContext, ProgramInfo, ProgramUniform } from '../types';
+
+import {
+ atomicOutputVariable,
+ createTensorShapeVariables,
+ inputVariable,
+ outputVariable,
+ ShaderHelper,
+} from './common';
+
+export interface ScatterNDAttributes extends AttributeWithCacheKey {
+ reduction: string;
+}
+
+type ReductionType = 'i32' | 'u32' | 'f32';
+
+const atomicReductionSnippet = (reduction: string, ptr: string, v: string, type: ReductionType) => {
+ if (reduction !== 'none' && type !== 'i32' && type !== 'u32' && type !== 'f32') {
+ throw new Error(`Input ${type} is not supported with reduction ${reduction}.`);
+ }
+
+ const floatStart = `{
+ var oldValue = 0;
+ loop {
+ let newValueF32 =`;
+ const floatEnd = `;
+ let newValue = bitcast<i32>(newValueF32);
+ let res = atomicCompareExchangeWeak(&${ptr}, oldValue, newValue);
+ if res.exchanged {
+ break;
+ }
+ oldValue = res.old_value;
+ }
+ }`;
+
+ switch (reduction) {
+ case 'none':
+ return `${ptr}=${v};`;
+ case 'add':
+ if (type === 'i32' || type === 'u32') {
+ return `atomicAdd(&${ptr}, bitcast<${type}>(${v}));`;
+ } else {
+ // atomicAdd only supports uint/int type. For float, we use
+ // atomicCompareExchangeWeak to simulate.
+ return `
+ ${floatStart}bitcast<${type}>(oldValue) + (${v})${floatEnd}`;
+ }
+ case 'max':
+ if (type === 'i32' || type === 'u32') {
+ return `atomicMax(&${ptr}, bitcast<${type}>(${v}));`;
+ } else {
+ // atomicMax only supports uint/int type. For float, we use
+ // atomicCompareExchangeWeak to simulate.
+ return `
+ ${floatStart}max(bitcast<${type}>(oldValue), (${v}))${floatEnd}`;
+ }
+ case 'min':
+ if (type === 'i32' || type === 'u32') {
+ return `atomicMin(&${ptr}, bitcast<${type}>(${v}));`;
+ } else {
+ // atomicMin only supports uint/int type. For float, we use
+ // atomicCompareExchangeWeak to simulate.
+ return `${floatStart}min(bitcast<${type}>(oldValue), (${v}))${floatEnd}`;
+ }
+ case 'mul':
+ // atomicMul is not supported, so we emulate it with atomicCompareExchangeWeak.
+ return `${floatStart}(bitcast<${type}>(oldValue) * (${v}))${floatEnd}`;
+
+ default:
+ throw new Error(`Reduction ${reduction} is not supported.`);
+ }
+};
+
+const createScatterNDProgramInfo = (inputs: readonly TensorView[], attributes: ScatterNDAttributes): ProgramInfo => {
+ const inputShape = inputs[0].dims;
+ const indicesShape = inputs[1].dims;
+ const outputShape = inputShape;
+ // TODO: support bool with components 4.
+ const components = 1;
+ const outputSize = Math.ceil(ShapeUtil.size(indicesShape) / components);
+ const lastIndexDimension = indicesShape[indicesShape.length - 1];
+ const numUpdatesElements = ShapeUtil.sizeFromDimension(inputShape, lastIndexDimension);
+
+ const programUniforms: ProgramUniform[] = [
+ { type: DataType.uint32, data: outputSize },
+ { type: DataType.uint32, data: lastIndexDimension },
+ { type: DataType.uint32, data: numUpdatesElements },
+ ...createTensorShapeVariables(inputs[1].dims, inputs[2].dims, outputShape),
+ ];
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const indices = inputVariable('indices', inputs[1].dataType, inputs[1].dims.length);
+ const updates = inputVariable('updates', inputs[2].dataType, inputs[2].dims.length, components);
+ const output =
+ attributes.reduction !== 'none' && attributes.reduction !== ''
+ ? atomicOutputVariable('output', inputs[0].dataType, outputShape.length)
+ : outputVariable('output', inputs[0].dataType, outputShape.length, components);
+
+ return `
+ ${shaderHelper
+ .registerUniform('output_size', 'u32')
+ .registerUniform('last_index_dimension', 'u32')
+ .registerUniform('num_updates_elements', 'u32')
+ .declareVariables(indices, updates, output)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ var data_offset = 0u;
+ let indices_start = uniforms.last_index_dimension * global_idx;
+ let indices_end = indices_start + uniforms.last_index_dimension;
+ for (var i = indices_start; i < indices_end; i++) {
+ var index = i32(indices[i].x);
+ ${
+ inputs[0].dims.length === 1
+ ? `
+ let element_count_dim = uniforms.output_strides;
+ let dim_value = uniforms.output_shape;`
+ : `
+ let element_count_dim = uniforms.output_strides[i - indices_start];
+ let dim_value = uniforms.output_shape[i - indices_start + uniforms.last_index_dimension];`
+ }
+ if (index >= 0) {
+ if (index >= i32(dim_value)) {
+ index = i32(dim_value - 1);
+ }
+ } else {
+ if (index < -i32(dim_value)) {
+ index = 0;
+ } else {
+ index += i32(dim_value);
+ }
+ }
+ data_offset += u32((u32(index) * element_count_dim));
+ }
+
+ for (var i = 0u; i < uniforms.num_updates_elements; i++) {
+ let value = updates[uniforms.num_updates_elements * global_idx + i];
+ ${atomicReductionSnippet(
+ attributes.reduction,
+ 'output[data_offset + i]',
+ 'value',
+ output.type.value as ReductionType,
+ )}
+ }
+
+ }`;
+ };
+ return {
+ name: 'ScatterND',
+ shaderCache: {
+ hint: `${attributes.cacheKey}_${attributes.reduction}`,
+ inputDependencies: ['rank', 'rank'],
+ },
+ getRunData: () => ({
+ outputs: [{ dims: outputShape, dataType: inputs[0].dataType }],
+ dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) },
+ programUniforms,
+ }),
+ getShaderSource,
+ };
+};
+
+export const parseScatterNDAttributes = (attributes: Record<string, unknown>): ScatterNDAttributes =>
+ createAttributeWithCacheKey({ reduction: attributes.reduction as string });
+
+export const scatterND = (context: ComputeContext, attributes: ScatterNDAttributes): void => {
+ context.compute(createScatterNDProgramInfo(context.inputs, attributes), {
+ inputs: [context.inputs[1], context.inputs[2]],
+ outputs: [],
+ });
+};
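For the non-'none' reductions on float data, atomicReductionSnippet emulates the missing float atomics with a compare-and-swap loop. Expanding the template by hand for reduction 'add' and type 'f32' gives roughly the following WGSL (shown as a constant purely for readability; the real string is assembled at runtime from floatStart/floatEnd):

const floatAtomicAddWGSL = `{
  var oldValue = 0;
  loop {
    let newValueF32 = bitcast<f32>(oldValue) + (value);
    let newValue = bitcast<i32>(newValueF32);
    let res = atomicCompareExchangeWeak(&output[data_offset + i], oldValue, newValue);
    if res.exchanged {
      break;
    }
    oldValue = res.old_value;
  }
}`;
console.log(floatAtomicAddWGSL);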
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
index 1fd99d085e0ed..5059645211aea 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
@@ -29,7 +29,9 @@ const permFunctionBody = (perm: number[], rank: number, input: IndicesHelper, ou
let reverseFunc = `fn perm(i: ${output.type.indices}) -> ${input.type.indices} {
var a: ${input.type.indices};`;
for (let i = 0; i < rank; ++i) {
- reverseFunc += input.indicesSet('a', perm[i], `i[${i}]`);
+ // the ranks of the input and output indices are always at least 2,
+ // so indexing `a` and `i` directly is always valid.
+ reverseFunc += `a[${perm[i]}]=i[${i}];`;
}
return (reverseFunc += 'return a;}');
};
@@ -48,17 +50,61 @@ const squeezeShape = (shape: readonly number[], adjustedPerm: number[]): { newSh
return { newShape, newPerm };
};
+const isTransposeReshape = (perm: number[], shape: readonly number[]) => {
+ // As long as the dims with values > 1 stay in the same order, it's a reshape.
+ // Example: Shape=(1,1,1024,4096) -> perm=(2,0,3,1).
+ let lastPermutedAxis = 0;
+ for (let i = 0; i < perm.length; ++i) {
+ if (shape[perm[i]] === 1) {
+ continue;
+ }
+ if (perm[i] < lastPermutedAxis) {
+ return false;
+ }
+ lastPermutedAxis = perm[i];
+ }
+ return true;
+};
+
export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: number[]): ProgramInfo => {
const inputDataType = inputTensor.dataType;
const inputRank = inputTensor.dims.length;
const perm = getAdjustedPerm(inputRank, permAttr);
const outputShape = getOutputShape(inputTensor.dims, perm);
+ let newInputShape = inputTensor.dims;
+ let newOutputShape = outputShape;
+ const transposeAsReshape = inputRank < 2 || isTransposeReshape(perm, inputTensor.dims);
+ let getShaderSource;
+ if (transposeAsReshape) {
+ getShaderSource = (shaderHelper: ShaderHelper) => {
+ const input = inputVariable('input', inputDataType, newInputShape, 4);
+ const output = outputVariable('output', inputDataType, newOutputShape, 4);
+ return `
+ ${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ output[global_idx] = input[global_idx];
+ }`;
+ };
+
+ return {
+ name: 'TransposeCopy',
+ shaderCache: { inputDependencies: ['type'] },
+ getRunData: () => {
+ const outputSize = ShapeUtil.size(outputShape);
+ return {
+ outputs: [{ dims: outputShape, dataType: inputTensor.dataType }],
+ dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */ / 4 /* components */) },
+ programUniforms: [{ type: DataType.uint32, data: Math.ceil(outputSize / 4) }],
+ };
+ },
+ getShaderSource,
+ };
+ }
const { newShape, newPerm } = squeezeShape(inputTensor.dims, perm);
const channelsLast = ShapeUtil.areEqual(newPerm, [2, 3, 1]);
const channelsFirst = ShapeUtil.areEqual(newPerm, [3, 1, 2]);
- const useShared = (newShape.length === 2 && newPerm[0] > newPerm[1]) || channelsLast || channelsFirst;
- let newInputShape = useShared ? newShape : inputTensor.dims;
- let newOutputShape = outputShape;
+ const useShared = newShape.length === 2 || channelsLast || channelsFirst;
if (useShared) {
newInputShape = channelsLast
? [newShape[0], newShape[1] * newShape[2]]
@@ -66,13 +112,11 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu
? [newShape[0] * newShape[1], newShape[2]]
: newShape;
newOutputShape = [newInputShape[1], newInputShape[0]];
- }
- const input = inputVariable('a', inputDataType, newInputShape.length);
- const output = outputVariable('output', inputDataType, newOutputShape.length);
- const tileSize = 16;
- let getShaderSource;
- if (useShared) {
- getShaderSource = (shaderHelper: ShaderHelper) => `
+ const tileSize = 16;
+ getShaderSource = (shaderHelper: ShaderHelper) => {
+ const input = inputVariable('a', inputDataType, newInputShape.length);
+ const output = outputVariable('output', inputDataType, newOutputShape.length);
+ return `
${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)}
  var<workgroup> tile : array<array<${output.type.value}, ${tileSize}>, ${tileSize}>;
${shaderHelper.mainStart([tileSize, tileSize, 1])}
@@ -92,8 +136,29 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu
${output.setByIndices(`${output.type.indices}(output_row, output_col)`, 'tile[local_id.x][local_id.y]')}
}
}`;
- } else {
- getShaderSource = (shaderHelper: ShaderHelper) => `
+ };
+ return {
+ name: 'TransposeShared',
+ shaderCache: { inputDependencies: ['type'] },
+ getRunData: () => {
+ const outputSize = ShapeUtil.size(outputShape);
+ return {
+ outputs: [{ dims: outputShape, dataType: inputTensor.dataType }],
+ dispatchGroup: { x: Math.ceil(newOutputShape[1] / tileSize), y: Math.ceil(newOutputShape[0] / tileSize) },
+ programUniforms: [
+ { type: DataType.uint32, data: outputSize },
+ ...createTensorShapeVariables(newInputShape, newOutputShape),
+ ],
+ };
+ },
+ getShaderSource,
+ };
+ }
+
+ getShaderSource = (shaderHelper: ShaderHelper) => {
+ const input = inputVariable('a', inputDataType, newInputShape.length);
+ const output = outputVariable('output', inputDataType, newOutputShape.length);
+ return `
${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)}
${permFunctionBody(perm, inputRank, input, output)}
@@ -106,17 +171,15 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu
${output.setByOffset('global_idx', input.getByIndices('aIndices'))}
}`;
- }
+ };
return {
- name: useShared ? 'TransposeShared' : 'Transpose',
+ name: 'Transpose',
shaderCache: { hint: `${permAttr}`, inputDependencies: ['rank'] },
getRunData: () => {
const outputSize = ShapeUtil.size(outputShape);
return {
outputs: [{ dims: outputShape, dataType: inputTensor.dataType }],
- dispatchGroup: useShared
- ? { x: Math.ceil(newOutputShape[1] / tileSize), y: Math.ceil(newOutputShape[0] / tileSize) }
- : { x: Math.ceil(outputSize / 64 /* workgroup size */) },
+ dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) },
programUniforms: [
{ type: DataType.uint32, data: outputSize },
...createTensorShapeVariables(newInputShape, newOutputShape),
diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts
index 042b8d9efaae9..2c5180c5db3ee 100644
--- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts
+++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts
@@ -93,16 +93,23 @@ export class ProgramManager {
build(programInfo: ProgramInfo, normalizedDispatchGroupSize: [number, number, number]): Artifact {
TRACE_FUNC_BEGIN(programInfo.name);
const device = this.backend.device;
- const extensions: string[] = [];
- if (device.features.has('shader-f16')) {
- extensions.push('enable f16;');
- }
- if (device.features.has('chromium-experimental-subgroups')) {
- extensions.push('enable chromium_experimental_subgroups;');
- }
+ const enableDirectives: string[] = [];
+
+ // Enable WGSL extensions based on available WebGPU features
+ const extensionsInfo: Array<{ feature: GPUFeatureName; extension: string }> = [
+ { feature: 'shader-f16', extension: 'f16' },
+ { feature: 'subgroups' as GPUFeatureName, extension: 'subgroups' },
+ { feature: 'subgroups-f16' as GPUFeatureName, extension: 'subgroups_f16' },
+ ];
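+    // e.g. this pushes 'enable f16;' when the device reports the 'shader-f16' feature.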
+ extensionsInfo.forEach((info) => {
+ if (device.features.has(info.feature)) {
+ enableDirectives.push(`enable ${info.extension};`);
+ }
+ });
+
const shaderHelper = createShaderHelper(normalizedDispatchGroupSize, this.backend.device.limits);
const userCode = programInfo.getShaderSource(shaderHelper);
- const code = `${extensions.join('\n')}\n${shaderHelper.additionalImplementations}\n${userCode}`;
+ const code = `${enableDirectives.join('\n')}\n${shaderHelper.additionalImplementations}\n${userCode}`;
const shaderModule = device.createShaderModule({ code, label: programInfo.name });
LOG_DEBUG('verbose', () => `[WebGPU] ${programInfo.name} shader code: ${code}`);
diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts
index 3b3c55733c973..9321ac170d036 100644
--- a/js/web/lib/wasm/jsep/webgpu/types.ts
+++ b/js/web/lib/wasm/jsep/webgpu/types.ts
@@ -21,6 +21,11 @@ export interface AdapterInfo {
isArchitecture: (architecture: GpuArchitecture) => boolean;
isVendor: (vendor: GpuVendor) => boolean;
}
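+/**
+ * Static capability information about the GPU device: subgroup support and, when known,
+ * the supported subgroup size range.
+ */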
+export interface DeviceInfo {
+ readonly subgroupsSupported: boolean;
+ readonly subgroupsF16Supported: boolean;
+ readonly subgroupSizeRange?: readonly [number, number];
+}
export interface GpuData {
type: GpuDataType;
@@ -160,6 +165,11 @@ export interface ComputeContext {
*/
readonly adapterInfo: AdapterInfo;
+ /**
+ * gpu device info
+ */
+ readonly deviceInfo: DeviceInfo;
+
/**
* stores the pointer to OpKernelContext
*/
@@ -187,8 +197,6 @@ export interface ComputeContext {
compute(program: ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[];
output(index: number, dims: readonly number[]): number;
- getMaxComputeWorkgroupSizes(): [number, number, number];
- getMaxComputeWorkgroupStoragesize(): number;
}
export type TimestampQuery = 'none' | 'inside-passes' | 'at-passes';
diff --git a/js/web/lib/wasm/jsep/webnn/tensor-manager.ts b/js/web/lib/wasm/jsep/webnn/tensor-manager.ts
index a19afd4bac732..4932691bda65b 100644
--- a/js/web/lib/wasm/jsep/webnn/tensor-manager.ts
+++ b/js/web/lib/wasm/jsep/webnn/tensor-manager.ts
@@ -78,7 +78,7 @@ const calculateByteLength = (dataType: MLOperandDataType, shape: readonly number
if (!size) {
throw new Error('Unsupported data type.');
}
- return Math.ceil((shape.reduce((a, b) => a * b) * size) / 8);
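+  // Guard the empty-shape (scalar) case: reduce() with no initial value throws on an empty array.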
+ return shape.length > 0 ? Math.ceil((shape.reduce((a, b) => a * b) * size) / 8) : 0;
};
/**
@@ -195,7 +195,7 @@ class TensorIdTracker {
}
// eslint-disable-next-line no-bitwise
- const usage = MLTensorUsage.READ | MLTensorUsage.WRITE;
+ const usage = typeof MLTensorUsage == 'undefined' ? undefined : MLTensorUsage.READ | MLTensorUsage.WRITE;
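+      // MLTensorUsage is deprecated and may be undefined in newer WebNN implementations;
+      // in that case the writable/readable arguments passed to getCachedTensor describe the access instead.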
this.wrapper = await this.tensorManager.getCachedTensor(dataType, shape, usage, true, true);
if (copyOld && this.activeUpload) {
@@ -349,7 +349,7 @@ class TensorManagerImpl implements TensorManager {
public async getCachedTensor(
dataType: MLOperandDataType,
shape: readonly number[],
- usage: MLTensorUsageFlags,
+ usage: MLTensorUsageFlags | undefined,
writable: boolean,
readable: boolean,
): Promise {
diff --git a/js/web/lib/wasm/jsep/webnn/webnn.d.ts b/js/web/lib/wasm/jsep/webnn/webnn.d.ts
index ca06f5c695f8d..c513b2ec2ed8b 100644
--- a/js/web/lib/wasm/jsep/webnn/webnn.d.ts
+++ b/js/web/lib/wasm/jsep/webnn/webnn.d.ts
@@ -400,7 +400,8 @@ declare const MLTensorUsage: {
};
interface MLTensorDescriptor extends MLOperandDescriptor {
- usage: MLTensorUsageFlags;
+  /** @deprecated Use readable/writable instead of usage */
+ usage: MLTensorUsageFlags | undefined;
importableToWebGPU?: boolean;
readable?: boolean;
writable?: boolean;
diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts
index f3794a72efbe8..da8939cd0263a 100644
--- a/js/web/lib/wasm/wasm-core-impl.ts
+++ b/js/web/lib/wasm/wasm-core-impl.ts
@@ -291,9 +291,6 @@ export const createSession = async (
const providerName = typeof provider === 'string' ? provider : provider.name;
if (providerName === 'webnn') {
wasm.shouldTransferToMLTensor = false;
- if (wasm.currentContext) {
- throw new Error('WebNN execution provider is already set.');
- }
if (typeof provider !== 'string') {
const webnnOptions = provider as InferenceSession.WebNNExecutionProviderOption;
const context = (webnnOptions as InferenceSession.WebNNOptionsWithMLContext)?.context;
@@ -490,7 +487,7 @@ export const prepareInputOutputTensor = (
}
if (location === 'gpu-buffer') {
- const gpuBuffer = tensor[2].gpuBuffer as GPUBuffer;
+ const gpuBuffer = tensor[2].gpuBuffer;
dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!;
const registerBuffer = wasm.jsepRegisterBuffer;
diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts
index 40c614fdf866a..ebeac5dc9e587 100644
--- a/js/web/lib/wasm/wasm-types.ts
+++ b/js/web/lib/wasm/wasm-types.ts
@@ -232,6 +232,23 @@ export declare namespace JSEP {
* @returns
*/
  jsepCreateMLContext(optionsOrGpuDevice?: MLContextOptions | GPUDevice): Promise<MLContext>;
+
+ /**
+ * [exported from pre-jsep.js] Register a WebNN Constant operand from external data.
+ * @param externalFilePath - specify the external file path.
+ * @param dataOffset - specify the external data offset.
+ * @param dataLength - specify the external data length.
+ * @param builder - specify the MLGraphBuilder used for constructing the Constant.
+ * @param desc - specify the MLOperandDescriptor of the Constant.
+ * @returns the WebNN Constant operand for the specified external data.
+ */
+ jsepRegisterMLConstant(
+ externalFilePath: string,
+ dataOffset: number,
+ dataLength: number,
+ builder: MLGraphBuilder,
+ desc: MLOperandDescriptor,
+ ): MLOperand;
}
}
diff --git a/js/web/package-lock.json b/js/web/package-lock.json
index 894667ad58933..07c8f0bf3b940 100644
--- a/js/web/package-lock.json
+++ b/js/web/package-lock.json
@@ -861,9 +861,9 @@
}
},
"node_modules/cross-spawn": {
- "version": "6.0.5",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.5.tgz",
- "integrity": "sha512-eTVLrBSt7fjbDygz805pMnstIs2VTBNkRm0qxZd+M7A5XDdxVRWO5MxGBXZhjY4cqLYLdtrGqRf8mBPmzwSpWQ==",
+ "version": "6.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.6.tgz",
+ "integrity": "sha512-VqCUuhcd1iB+dsv8gxPttb5iZh/D0iubSP21g36KXdEuf6I5JiioesUVjpCdHV9MZRUfVFlvwtIUyPfxo5trtw==",
"dev": true,
"dependencies": {
"nice-try": "^1.0.4",
@@ -4312,9 +4312,9 @@
}
},
"cross-spawn": {
- "version": "6.0.5",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.5.tgz",
- "integrity": "sha512-eTVLrBSt7fjbDygz805pMnstIs2VTBNkRm0qxZd+M7A5XDdxVRWO5MxGBXZhjY4cqLYLdtrGqRf8mBPmzwSpWQ==",
+ "version": "6.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.6.tgz",
+ "integrity": "sha512-VqCUuhcd1iB+dsv8gxPttb5iZh/D0iubSP21g36KXdEuf6I5JiioesUVjpCdHV9MZRUfVFlvwtIUyPfxo5trtw==",
"dev": true,
"requires": {
"nice-try": "^1.0.4",
diff --git a/js/web/package.json b/js/web/package.json
index 656cd7b56b039..181d6127f5455 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -83,7 +83,7 @@
"types": "./types.d.ts"
},
"./wasm": {
- "import": "./dist/ort.wasm.min.mjs",
+ "import": "./dist/ort.wasm.bundle.min.mjs",
"require": "./dist/ort.wasm.min.js",
"types": "./types.d.ts"
},
diff --git a/js/web/script/build.ts b/js/web/script/build.ts
index 408f9e00a5cbd..529e9d1065e69 100644
--- a/js/web/script/build.ts
+++ b/js/web/script/build.ts
@@ -591,14 +591,14 @@ async function main() {
// ort[.min].[m]js
await addAllWebBuildTasks({
outputName: 'ort',
- define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true' },
+ define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true' },
});
// ort.bundle.min.mjs
await buildOrt({
isProduction: true,
outputName: 'ort.bundle',
format: 'esm',
- define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true', 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'true' },
+ define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'true' },
});
// ort.webgpu[.min].[m]js
@@ -619,6 +619,13 @@ async function main() {
outputName: 'ort.wasm',
define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true', 'BUILD_DEFS.DISABLE_WEBGL': 'true' },
});
+ // ort.wasm.bundle.min.mjs
+ await buildOrt({
+ isProduction: true,
+ outputName: 'ort.wasm.bundle',
+ format: 'esm',
+ define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true', 'BUILD_DEFS.DISABLE_WEBGL': 'true' },
+ });
// ort.webgl[.min].[m]js
await addAllWebBuildTasks({
outputName: 'ort.webgl',
diff --git a/js/web/test/data/ops/expand.jsonc b/js/web/test/data/ops/expand.jsonc
index 613b4507b2b15..8fbe9339feb9b 100644
--- a/js/web/test/data/ops/expand.jsonc
+++ b/js/web/test/data/ops/expand.jsonc
@@ -134,6 +134,56 @@
"type": "float32"
}
]
+ },
+ {
+ "name": "Expand in components = 1, out components = 4",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6],
+ "dims": [3, 2, 1],
+ "type": "float32"
+ },
+ {
+ "data": [3, 1, 8],
+ "dims": [3],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+ 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6
+ ],
+ "dims": [3, 2, 8],
+ "type": "float32"
+ }
+ ]
+ },
+ {
+ "name": "Expand in components = 4, out components = 4",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ "dims": [1, 1, 2, 8],
+ "type": "float32"
+ },
+ {
+ "data": [2, 1, 8],
+ "dims": [3],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16
+ ],
+ "dims": [1, 2, 2, 8],
+ "type": "float32"
+ }
+ ]
}
]
},
diff --git a/js/web/test/data/ops/gather-nd.jsonc b/js/web/test/data/ops/gather-nd.jsonc
new file mode 100644
index 0000000000000..209c7d1f74087
--- /dev/null
+++ b/js/web/test/data/ops/gather-nd.jsonc
@@ -0,0 +1,147 @@
+[
+ {
+ "name": "GatherND int32",
+ "operator": "GatherND",
+ "attributes": [],
+ "cases": [
+ {
+ "name": "data[4] indices[]",
+ "inputs": [
+ {
+ "data": [100, 101, 102, 777, 778, 779, 1000, 1001, 1002],
+ "dims": [9],
+ "type": "int32"
+ },
+ {
+ "data": [0, 4, 8],
+ "dims": [3, 1],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [100, 778, 1002],
+ "dims": [3],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "GatherND float32",
+ "operator": "GatherND",
+ "attributes": [],
+ "cases": [
+ {
+ "name": "data[4] indices[]",
+ "inputs": [
+ {
+ "data": [100.1, 101.2, 102.3, 777.4, 778.5, 779.6, 1000.7, 1001.8, 1002.9],
+ "dims": [9],
+ "type": "float32"
+ },
+ {
+ "data": [0, 4, 8],
+ "dims": [3, 1],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [100.0999984741211, 778.5, 1002.9000244140625],
+ "dims": [3],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "GatherND int32 [2 2 2], batch_dims",
+ "operator": "GatherND",
+ "attributes": [{ "name": "batch_dims", "data": 1, "type": "int" }],
+ "cases": [
+ {
+ "name": "data[4] indices[]",
+ "inputs": [
+ {
+ "data": [0, 1, 2, 3, 4, 5, 6, 7],
+ "dims": [2, 2, 2],
+ "type": "int32"
+ },
+ {
+ "data": [1, 0],
+ "dims": [2, 1],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [2, 3, 4, 5],
+ "dims": [2, 2],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "GatherND float16",
+ "operator": "GatherND",
+ "attributes": [],
+ "cases": [
+ {
+ "name": "data[4] indices[]",
+ "inputs": [
+ {
+ "data": [100.1, 101.2, 102.3, 777.4, 778.5, 779.6, 1000.7, 1001.8, 1002.9],
+ "dims": [9],
+ "type": "float16"
+ },
+ {
+ "data": [0, 4, 8],
+ "dims": [3, 1],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [100.0999984741211, 778.5, 1002.9000244140625],
+ "dims": [3],
+ "type": "float16"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "GatherND uint32 [2 2 2], batch_dims",
+ "operator": "GatherND",
+ "attributes": [{ "name": "batch_dims", "data": 1, "type": "int" }],
+ "cases": [
+ {
+ "name": "data[4] indices[]",
+ "inputs": [
+ {
+ "data": [0, 1, 2, 3, 4, 5, 6, 7],
+ "dims": [2, 2, 2],
+ "type": "uint32"
+ },
+ {
+ "data": [1, 0],
+ "dims": [2, 1],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [2, 3, 4, 5],
+ "dims": [2, 2],
+ "type": "uint32"
+ }
+ ]
+ }
+ ]
+ }
+]
diff --git a/js/web/test/data/ops/matmul.jsonc b/js/web/test/data/ops/matmul.jsonc
index ead6427350bca..f5996db1aecb6 100644
--- a/js/web/test/data/ops/matmul.jsonc
+++ b/js/web/test/data/ops/matmul.jsonc
@@ -363,6 +363,100 @@
"type": "float32"
}
]
+ },
+ {
+ "name": "same ranks different broadcast small 0",
+ "inputs": [
+ {
+ "data": [0, 1, 2, 3, 4, 5, 6, 7],
+ "dims": [1, 2, 2, 2],
+ "type": "float32"
+ },
+ {
+ "data": [8, 9, 10, 11],
+ "dims": [2, 1, 2, 1],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [9, 43, 77, 111, 11, 53, 95, 137],
+ "dims": [2, 2, 2, 1],
+ "type": "float32"
+ }
+ ]
+ },
+ {
+ "name": "same ranks different broadcast small 1",
+ "inputs": [
+ {
+ "data": [0, 1, 2, 3, 4, 5, 6, 7],
+ "dims": [2, 1, 2, 2],
+ "type": "float32"
+ },
+ {
+ "data": [8, 9, 10, 11],
+ "dims": [1, 2, 2, 1],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [9, 43, 11, 53, 77, 111, 95, 137],
+ "dims": [2, 2, 2, 1],
+ "type": "float32"
+ }
+ ]
+ },
+ {
+ "name": "same ranks different broadcast larger 0",
+ "inputs": [
+ {
+ "data": [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31
+ ],
+ "dims": [1, 2, 2, 8],
+ "type": "float32"
+ },
+ {
+ "data": [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
+ "dims": [2, 1, 8, 1],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1036, 3308, 5580, 7852, 1260, 4044, 6828, 9612],
+ "dims": [2, 2, 2, 1],
+ "type": "float32"
+ }
+ ]
+ },
+ {
+ "name": "same ranks different broadcast larger 1",
+ "inputs": [
+ {
+ "data": [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31
+ ],
+ "dims": [2, 1, 2, 8],
+ "type": "float32"
+ },
+ {
+ "data": [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
+ "dims": [1, 2, 8, 1],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1036, 3308, 1260, 4044, 5580, 7852, 6828, 9612],
+ "dims": [2, 2, 2, 1],
+ "type": "float32"
+ }
+ ]
}
]
}
diff --git a/js/web/test/data/ops/scatternd.jsonc b/js/web/test/data/ops/scatternd.jsonc
new file mode 100644
index 0000000000000..5135bb9e4d3a5
--- /dev/null
+++ b/js/web/test/data/ops/scatternd.jsonc
@@ -0,0 +1,472 @@
+[
+ {
+ "name": "ScatterND int32",
+ "operator": "ScatterND",
+ "attributes": [],
+ "opset": { "domain": "", "version": 13 },
+ "cases": [
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8],
+ "dims": [8],
+ "type": "int32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9, 10, 11, 12],
+ "dims": [1, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1, 11, 3, 10, 9, 6, 7, 12],
+ "dims": [8],
+ "type": "int32"
+ }
+ ]
+ },
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64
+ ],
+ "dims": [4, 4, 4],
+ "type": "int32"
+ },
+ {
+ "data": [1, 2],
+ "dims": [2, 1],
+ "type": "int64"
+ },
+ {
+ "data": [
+ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+ 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131
+ ],
+ "dims": [2, 4, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+ 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+ 131, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
+ ],
+ "dims": [4, 4, 4],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND float32",
+ "operator": "ScatterND",
+ "attributes": [],
+ "opset": { "domain": "", "version": 13 },
+ "cases": [
+ {
+ "name": "float32",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1.1, 11.3, 3.1, 10.2, 9.1, 6.1, 7.8, 12.5],
+ "dims": [8],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND add int32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "add", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8],
+ "dims": [8],
+ "type": "int32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9, 10, 11, 12],
+ "dims": [1, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1, 13, 3, 14, 14, 6, 7, 20],
+ "dims": [8],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND add float32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "add", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "float32",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1.100000023841858, 13.5, 3.0999999046325684, 14.699999809265137, 14.40000057220459, 6.099999904632568,
+ 7.800000190734863, 21.399999618530273
+ ],
+ "dims": [8],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND mul int32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "mul", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [11, 22, 31, 45, 53, 61, 78, 89],
+ "dims": [8],
+ "type": "int32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [91, 102, 113, 125],
+ "dims": [1, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [11, 2486, 31, 4590, 4823, 61, 78, 11125],
+ "dims": [8],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND min int32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "min", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [11, 22, 31, 45, 53, 61, 78, 89],
+ "dims": [8],
+ "type": "int32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [91, 102, 113, 125],
+ "dims": [1, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [11, 22, 31, 45, 53, 61, 78, 89],
+ "dims": [8],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND max int32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "max", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [11, 22, 31, 45, 53, 61, 78, 89],
+ "dims": [8],
+ "type": "int32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [91, 102, 113, 125],
+ "dims": [1, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [11, 113, 31, 102, 91, 61, 78, 125],
+ "dims": [8],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND mul float32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "mul", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "float32",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1.100000023841858, 24.860000610351562, 3.0999999046325684, 45.89999771118164, 48.230003356933594,
+ 6.099999904632568, 7.800000190734863, 111.24999237060547
+ ],
+ "dims": [8],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND min float32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "min", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "float32",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1.100000023841858, 2.200000047683716, 3.0999999046325684, 4.5, 5.300000190734863, 6.099999904632568,
+ 7.800000190734863, 8.899999618530273
+ ],
+ "dims": [8],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND max float32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "max", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "float32",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1.100000023841858, 11.300000190734863, 3.0999999046325684, 10.199999809265137, 9.100000381469727,
+ 6.099999904632568, 7.800000190734863, 12.5
+ ],
+ "dims": [8],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND float16",
+ "operator": "ScatterND",
+ "attributes": [],
+ "opset": { "domain": "", "version": 11 },
+ "cases": [
+ {
+ "name": "float16",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float16"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float16"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1.1, 11.3, 3.1, 10.2, 9.1, 6.1, 7.8, 12.5],
+ "dims": [8],
+ "type": "float16"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND mul uint32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "mul", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "uint32",
+ "inputs": [
+ {
+ "data": [11, 22, 31, 45, 53, 61, 78, 89],
+ "dims": [8],
+ "type": "uint32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [91, 102, 113, 125],
+ "dims": [1, 4],
+ "type": "uint32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [11, 2486, 31, 4590, 4823, 61, 78, 11125],
+ "dims": [8],
+ "type": "uint32"
+ }
+ ]
+ }
+ ]
+ }
+]
diff --git a/js/web/test/data/ops/transpose.jsonc b/js/web/test/data/ops/transpose.jsonc
index a7265d6444118..d431ceb1712a5 100644
--- a/js/web/test/data/ops/transpose.jsonc
+++ b/js/web/test/data/ops/transpose.jsonc
@@ -263,6 +263,30 @@
}
]
},
+ {
+ "name": "Transpose as reshape - perms:[1, 0, 2, 4, 3]",
+ "operator": "Transpose",
+ "attributes": [{ "name": "perm", "data": [1, 0, 2, 4, 3], "type": "ints" }],
+ "cases": [
+ {
+ "name": "T[3, 1, 2, 1, 4]",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
+ "dims": [3, 1, 2, 1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
+ "dims": [1, 3, 2, 4, 1],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
{
"name": "Transpose - perms:[1, 0]",
"operator": "Transpose",
diff --git a/js/web/test/e2e/browser-test-wasm-binary-override.js b/js/web/test/e2e/browser-test-wasm-binary-override.js
index 471c26f6990b5..27cce2ca06236 100644
--- a/js/web/test/e2e/browser-test-wasm-binary-override.js
+++ b/js/web/test/e2e/browser-test-wasm-binary-override.js
@@ -7,7 +7,7 @@ const documentUrl = document.currentScript.src;
it('Browser E2E testing - WebAssembly backend', async function () {
// preload .wasm file binary
- const wasmUrl = new URL('./node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.wasm', documentUrl).href;
+ const wasmUrl = new URL('./node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.jsep.wasm', documentUrl).href;
const response = await fetch(wasmUrl);
// make sure the .wasm file is loaded successfully
diff --git a/js/web/test/e2e/browser-test-wasm-path-override-filename-jsep.js b/js/web/test/e2e/browser-test-wasm-path-override-filename-jsep.js
new file mode 100644
index 0000000000000..d325a5ca7187d
--- /dev/null
+++ b/js/web/test/e2e/browser-test-wasm-path-override-filename-jsep.js
@@ -0,0 +1,28 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+'use strict';
+
+it('Browser E2E testing - WebAssembly backend (path override filename)', async function () {
+ // check base URL port from test args
+ if (typeof __ort_arg_port === 'undefined') {
+ throw new Error('test flag --port= is required');
+ }
+ const base = `http://localhost:${__ort_arg_port}/`;
+
+ ort.env.wasm.wasmPaths = {};
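+  // Start from an empty wasmPaths map and only override the artifacts requested via the --files test argument.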
+
+ if (typeof __ort_arg_files === 'string' && __ort_arg_files.includes('wasm')) {
+ const overrideWasmUrl = new URL('./test-wasm-path-override/jsep-renamed.wasm', base).href;
+ console.log(`ort.env.wasm.wasmPaths['wasm'] = ${JSON.stringify(overrideWasmUrl)};`);
+ ort.env.wasm.wasmPaths.wasm = overrideWasmUrl;
+ }
+
+ if (typeof __ort_arg_files === 'string' && __ort_arg_files.includes('mjs')) {
+ const overrideMjsUrl = new URL('./test-wasm-path-override/jsep-renamed.mjs', base).href;
+ console.log(`ort.env.wasm.wasmPaths['mjs'] = ${JSON.stringify(overrideMjsUrl)};`);
+ ort.env.wasm.wasmPaths.mjs = overrideMjsUrl;
+ }
+
+ await testFunction(ort, { executionProviders: ['wasm'] });
+});
diff --git a/js/web/test/e2e/run-data.js b/js/web/test/e2e/run-data.js
index 04079b042bc23..dbc3ca0bd2460 100644
--- a/js/web/test/e2e/run-data.js
+++ b/js/web/test/e2e/run-data.js
@@ -14,7 +14,7 @@ const NODEJS_TEST_CASES = [
// [test_for_same_origin, test_for_cross_origin, main_js, ort_main_js, [test_args]]
const BROWSER_TEST_CASES = [
// IIFE
- [true, true, './browser-test-webgl.js', 'ort.min.js'], // webgl
+ [true, true, './browser-test-webgl.js', 'ort.all.min.js'], // webgl
[true, true, './browser-test-webgl.js', 'ort.webgl.min.js'], // webgl
[true, true, './browser-test-wasm.js', 'ort.wasm.min.js'], // wasm, ort.wasm
[true, true, './browser-test-wasm-multi-session-create.js', 'ort.min.js'], // wasm, multi-session create
@@ -24,7 +24,7 @@ const BROWSER_TEST_CASES = [
[true, true, './browser-test-wasm.js', 'ort.min.js', ['num_threads=1', 'proxy=1']], // wasm, 1 thread, proxy
// ort.min.mjs
- [true, true, './browser-test-webgl.js', 'ort.min.mjs'], // webgl
+ [true, true, './browser-test-webgl.js', 'ort.webgl.min.mjs'], // webgl
[true, true, './browser-test-wasm.js', 'ort.min.mjs', ['num_threads=1']], // wasm, 1 thread
[true, true, './browser-test-wasm.js', 'ort.min.mjs', ['num_threads=2']], // wasm, 2 threads
[true, true, './browser-test-wasm.js', 'ort.min.mjs', ['num_threads=2', 'proxy=1']], // wasm, 2 threads, proxy
@@ -41,22 +41,22 @@ const BROWSER_TEST_CASES = [
// path override:
// wasm, path override filenames for both mjs and wasm, same origin
- [true, false, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=9876', 'files=mjs,wasm']],
+ [true, false, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=9876', 'files=mjs,wasm']],
[true, false, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=9876', 'files=mjs,wasm']],
// wasm, path override filenames for both mjs and wasm, cross origin
- [false, true, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=8081', 'files=mjs,wasm']],
+ [false, true, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=8081', 'files=mjs,wasm']],
[false, true, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=8081', 'files=mjs,wasm']],
// wasm, path override filename for wasm, same origin
- [true, false, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=9876', 'files=wasm']],
+ [true, false, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=9876', 'files=wasm']],
[true, false, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=9876', 'files=wasm']],
// wasm, path override filename for wasm, cross origin
- [false, true, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=8081', 'files=wasm']],
+ [false, true, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=8081', 'files=wasm']],
[false, true, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=8081', 'files=wasm']],
// wasm, path override filename for mjs, same origin
- [true, false, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=9876', 'files=mjs']],
+ [true, false, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=9876', 'files=mjs']],
[true, false, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=9876', 'files=mjs']],
// wasm, path override filename for mjs, cross origin
- [false, true, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=8081', 'files=mjs']],
+ [false, true, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=8081', 'files=mjs']],
[false, true, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=8081', 'files=mjs']],
// wasm, path override prefix, same origin
[true, false, './browser-test-wasm-path-override-prefix.js', 'ort.min.js', ['port=9876']],
diff --git a/js/web/test/e2e/run.js b/js/web/test/e2e/run.js
index 93f9d4a144bf2..3361bbece64ed 100644
--- a/js/web/test/e2e/run.js
+++ b/js/web/test/e2e/run.js
@@ -146,6 +146,10 @@ function prepareWasmPathOverrideFiles() {
fs.copyFileSync(`${sourceFile}.wasm`, path.join(folder, 'ort-wasm-simd-threaded.wasm'));
fs.copyFileSync(`${sourceFile}.mjs`, path.join(folder, 'renamed.mjs'));
fs.copyFileSync(`${sourceFile}.wasm`, path.join(folder, 'renamed.wasm'));
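+  // Also stage the JSEP-enabled artifacts under both their default and renamed filenames
+  // for the JSEP path-override test cases.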
+ fs.copyFileSync(`${sourceFile}.jsep.mjs`, path.join(folder, 'ort-wasm-simd-threaded.jsep.mjs'));
+ fs.copyFileSync(`${sourceFile}.jsep.wasm`, path.join(folder, 'ort-wasm-simd-threaded.jsep.wasm'));
+ fs.copyFileSync(`${sourceFile}.jsep.mjs`, path.join(folder, 'jsep-renamed.mjs'));
+ fs.copyFileSync(`${sourceFile}.jsep.wasm`, path.join(folder, 'jsep-renamed.wasm'));
}
async function testAllNodejsCases() {
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc
index 1f379e0de6165..f179756967d49 100644
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@@ -570,14 +570,14 @@
"test_greater_equal_expanded",
"test_greater_equal",
"test_greater",
- // // "test_gridsample_aligncorners_true",
- // // "test_gridsample_bicubic",
- // // "test_gridsample_bilinear",
- // // "test_gridsample_border_padding",
- // // "test_gridsample_nearest",
- // // "test_gridsample_reflection_padding",
- // // "test_gridsample_zeros_padding",
- // // "test_gridsample",
+ "test_gridsample_aligncorners_true",
+ "test_gridsample_bicubic",
+ "test_gridsample_bilinear",
+ "test_gridsample_border_padding",
+ "test_gridsample_nearest",
+ "test_gridsample_reflection_padding",
+ "test_gridsample_zeros_padding",
+ "test_gridsample",
// // "test_gru_batchwise",
// // "test_gru_defaults",
// // "test_gru_seq_length",
@@ -1365,6 +1365,7 @@
"gather.jsonc",
"gather-block-quantized.jsonc",
"gather-elements.jsonc",
+ "gather-nd.jsonc",
"gemm.jsonc",
"global-average-pool.jsonc",
"greater.jsonc",
@@ -1396,6 +1397,7 @@
"pow-big-number.jsonc",
"reshape.jsonc",
"rotary-embedding.jsonc",
+ "scatternd.jsonc",
"simplified-layer-norm.jsonc",
"skip-layer-norm.jsonc",
"skip-simplified-layer-norm.jsonc",
@@ -2362,14 +2364,14 @@
// "test_sinh",
// // "test_size_example",
// // "test_size",
- // "test_slice_default_axes",
- // "test_slice_default_steps",
- // "test_slice_end_out_of_bounds",
- // "test_slice_neg_steps",
- // "test_slice_neg",
- // "test_slice_negative_axes",
- // "test_slice_start_out_of_bounds",
- // "test_slice",
+ "test_slice_default_axes",
+ "test_slice_default_steps",
+ "test_slice_end_out_of_bounds",
+ "test_slice_neg_steps",
+ "test_slice_neg",
+ "test_slice_negative_axes",
+ "test_slice_start_out_of_bounds",
+ "test_slice",
// "test_softmax_axis_0_expanded",
"test_softmax_axis_0",
// "test_softmax_axis_1_expanded",
diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts
index c37c10c781400..d54ba32f9f494 100644
--- a/js/web/test/test-runner.ts
+++ b/js/web/test/test-runner.ts
@@ -661,7 +661,7 @@ async function createMLTensorForOutput(mlContext: MLContext, type: ort.Tensor.Ty
shape: dims as number[],
// Assign both shape and dimensions while transitioning to new API.
dimensions: dims as number[],
- usage: MLTensorUsage.READ,
+ usage: typeof MLTensorUsage == 'undefined' ? undefined : MLTensorUsage.READ,
readable: true,
});
@@ -686,7 +686,7 @@ async function createMLTensorForInput(mlContext: MLContext, cpuTensor: ort.Tenso
shape: cpuTensor.dims as number[],
// Assign both shape and dimensions while transitioning to new API.
dimensions: cpuTensor.dims as number[],
- usage: MLTensorUsage.WRITE,
+ usage: typeof MLTensorUsage == 'undefined' ? undefined : MLTensorUsage.WRITE,
writable: true,
});
mlContext.writeTensor(mlTensor, cpuTensor.data);
diff --git a/objectivec/error_utils.mm b/objectivec/error_utils.mm
index 335cf8894d549..e8d4d5bb365c9 100644
--- a/objectivec/error_utils.mm
+++ b/objectivec/error_utils.mm
@@ -11,7 +11,7 @@ void ORTSaveCodeAndDescriptionToError(int code, const char* descriptionCstr, NSE
if (!error) return;
NSString* description = [NSString stringWithCString:descriptionCstr
- encoding:NSASCIIStringEncoding];
+ encoding:NSUTF8StringEncoding];
*error = [NSError errorWithDomain:kOrtErrorDomain
code:code
diff --git a/objectivec/include/ort_coreml_execution_provider.h b/objectivec/include/ort_coreml_execution_provider.h
index d7d873f5eb0e0..41d15aa39453a 100644
--- a/objectivec/include/ort_coreml_execution_provider.h
+++ b/objectivec/include/ort_coreml_execution_provider.h
@@ -70,7 +70,22 @@ NS_ASSUME_NONNULL_BEGIN
*/
- (BOOL)appendCoreMLExecutionProviderWithOptions:(ORTCoreMLExecutionProviderOptions*)options
error:(NSError**)error;
-
+/**
+ * Enables the CoreML execution provider in the session configuration options.
+ * It is appended to the execution provider list which is ordered by
+ * decreasing priority.
+ *
+ * @param provider_options The CoreML execution provider options as a dictionary.
+ *        Available key-value pairs are described in core/providers/coreml/coreml_execution_provider.h:
+ * kCoremlProviderOption_MLComputeUnits: one of "CPUAndNeuralEngine", "CPUAndGPU", "CPUOnly", "All"
+ * kCoremlProviderOption_ModelFormat: one of "MLProgram", "NeuralNetwork"
+ * kCoremlProviderOption_RequireStaticInputShapes: "1" or "0"
+ * kCoremlProviderOption_EnableOnSubgraphs: "1" or "0"
+ * @param error Optional error information set if an error occurs.
+ * @return Whether the provider was enabled successfully.
+ */
+- (BOOL)appendCoreMLExecutionProviderWithOptionsV2:(NSDictionary<NSString*, NSString*>*)provider_options
+ error:(NSError**)error;
@end
NS_ASSUME_NONNULL_END
diff --git a/objectivec/ort_coreml_execution_provider.mm b/objectivec/ort_coreml_execution_provider.mm
index 6cb5026b93521..0c790a91fb8b9 100644
--- a/objectivec/ort_coreml_execution_provider.mm
+++ b/objectivec/ort_coreml_execution_provider.mm
@@ -43,6 +43,21 @@ - (BOOL)appendCoreMLExecutionProviderWithOptions:(ORTCoreMLExecutionProviderOpti
#endif
}
+- (BOOL)appendCoreMLExecutionProviderWithOptionsV2:(NSDictionary<NSString*, NSString*>*)provider_options
+ error:(NSError**)error {
+#if ORT_OBJC_API_COREML_EP_AVAILABLE
+ try {
+ return [self appendExecutionProvider:@"CoreML" providerOptions:provider_options error:error];
+ }
+ ORT_OBJC_API_IMPL_CATCH_RETURNING_BOOL(error);
+
+#else // !ORT_OBJC_API_COREML_EP_AVAILABLE
+  static_cast<void>(provider_options);
+ ORTSaveCodeAndDescriptionToError(ORT_FAIL, "CoreML execution provider is not enabled.", error);
+ return NO;
+#endif
+}
+
@end
NS_ASSUME_NONNULL_END
diff --git a/objectivec/test/ort_session_test.mm b/objectivec/test/ort_session_test.mm
index 508289f7bc748..409ee7e1584e2 100644
--- a/objectivec/test/ort_session_test.mm
+++ b/objectivec/test/ort_session_test.mm
@@ -223,6 +223,28 @@ - (void)testAppendCoreMLEP {
ORTAssertNullableResultSuccessful(session, err);
}
+- (void)testAppendCoreMLEP_v2 {
+ NSError* err = nil;
+ ORTSessionOptions* sessionOptions = [ORTSessionTest makeSessionOptions];
+ NSDictionary* provider_options = @{@"EnableOnSubgraphs" : @"1"}; // set an arbitrary option
+
+ BOOL appendResult = [sessionOptions appendCoreMLExecutionProviderWithOptionsV2:provider_options
+ error:&err];
+
+ if (!ORTIsCoreMLExecutionProviderAvailable()) {
+ ORTAssertBoolResultUnsuccessful(appendResult, err);
+ return;
+ }
+
+ ORTAssertBoolResultSuccessful(appendResult, err);
+
+ ORTSession* session = [[ORTSession alloc] initWithEnv:self.ortEnv
+ modelPath:[ORTSessionTest getAddModelPath]
+ sessionOptions:sessionOptions
+ error:&err];
+ ORTAssertNullableResultSuccessful(session, err);
+}
+
- (void)testAppendXnnpackEP {
NSError* err = nil;
ORTSessionOptions* sessionOptions = [ORTSessionTest makeSessionOptions];
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention.cc b/onnxruntime/contrib_ops/cpu/bert/attention.cc
index b15e865aa423c..ad14fb8258656 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention.cc
+++ b/onnxruntime/contrib_ops/cpu/bert/attention.cc
@@ -30,7 +30,6 @@ class Attention : public OpKernel, public AttentionCPUBase {
Status Compute(OpKernelContext* context) const override;
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
- bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;
@@ -102,7 +101,6 @@ bool Attention::IsPackWeightsSuccessful(int qkv_index,
template <typename T>
Status Attention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
- bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
/* The PrePack() massages the weights to speed up Compute(), there is an option to
diff --git a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
index 0bdee151d2173..4cc5a4228dc8c 100644
--- a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
+++ b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
@@ -11,18 +11,19 @@ namespace onnxruntime {
namespace contrib {
namespace group_query_attention_helper {
-Status CheckInputs(const Tensor* query,
- const Tensor* key,
- const Tensor* value,
- const Tensor* past_key,
- const Tensor* past_value,
- const Tensor* cos_cache,
- const Tensor* sin_cache,
+template <typename T = Tensor>
+Status CheckInputs(const T* query,
+ const T* key,
+ const T* value,
+ const T* past_key,
+ const T* past_value,
+ const T* cos_cache,
+ const T* sin_cache,
void* parameters,
int num_heads,
int kv_num_heads,
- const Tensor* seqlens_k,
- const Tensor* total_seqlen,
+ const T* seqlens_k,
+ const T* total_seqlen,
float scale,
float softcap) {
// Note: Here S* is seqlen_past_kv_cache, S+ is seqlen_present_kv_cache
@@ -265,18 +266,19 @@ Status CheckInputs(const Tensor* query,
return Status::OK();
}
-Status CheckInputs(const Tensor* query,
- const Tensor* key,
- const Tensor* value,
- const Tensor* past_key,
- const Tensor* past_value,
- const Tensor* cos_cache,
- const Tensor* sin_cache,
+template <typename T = Tensor>
+Status CheckInputs(const T* query,
+ const T* key,
+ const T* value,
+ const T* past_key,
+ const T* past_value,
+ const T* cos_cache,
+ const T* sin_cache,
void* parameters,
int num_heads,
int kv_num_heads,
- const Tensor* seqlens_k,
- const Tensor* total_seqlen,
+ const T* seqlens_k,
+ const T* total_seqlen,
float scale,
float softcap,
int max_threads_per_block) {
diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc
index cbfd2f0949363..9a6c2af022c91 100644
--- a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc
+++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc
@@ -4,6 +4,7 @@
#include "contrib_ops/cpu/bert/rotary_embedding.h"
#include "contrib_ops/cpu/bert/rotary_embedding_helper.h"
+#include "core/mlas/inc/mlas.h"
#include "core/platform/threadpool.h"
using onnxruntime::concurrency::ThreadPool;
@@ -78,31 +79,12 @@ Status RunRotaryEmbedding(concurrency::ThreadPool* tp, RotaryParameters paramete
const T* cos_data = cos_cache + cache_offset;
const T* sin_data = sin_cache + cache_offset;
- int cache_idx = 0;
- bool sign = false;
- int j = 0;
- for (int i = 0; i < rotary_emb_dim; i++) {
- if (interleaved) {
- cache_idx = (i / 2) % half_rotary_emb_dim;
- sign = i & 1;
- j = sign ? i - 1 : i + 1; // i - sign
- } else {
- cache_idx = i % half_rotary_emb_dim;
- sign = (i >= half_rotary_emb_dim);
- j = (i + half_rotary_emb_dim) % rotary_emb_dim;
- }
-      float output_data_i = static_cast<float>(input_data[i]) * static_cast<float>(cos_data[cache_idx]);
-      float input_data_j = static_cast<float>(input_data[j]);
-      float sin_data_cache_idx = static_cast<float>(sin_data[cache_idx]);
- if (sign) {
- output_data_i += input_data_j * sin_data_cache_idx;
- } else {
- output_data_i -= input_data_j * sin_data_cache_idx;
- }
-      output_data[i] = static_cast<T>(output_data_i);
- }
- for (int i = rotary_emb_dim; i < head_size; i++) {
- output_data[i] = input_data[i];
+ MlasRotaryEmbedOneRow(input_data, sin_data, cos_data, rotary_emb_dim, interleaved, output_data);
+
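+    // Copy any non-rotary tail of each head through unchanged.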
+ if (rotary_emb_dim < head_size) {
+ std::memcpy(output_data + rotary_emb_dim,
+ input_data + rotary_emb_dim,
+ (head_size - rotary_emb_dim) * sizeof(T));
}
}
});
diff --git a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
index 71a66ea368943..2c897f183164f 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
@@ -24,7 +24,6 @@ class QAttention : public OpKernel, public AttentionCPUBase {
Status Compute(OpKernelContext* context) const override;
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
- bool save_prepacked_initializers,
bool& /*out*/ is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;
@@ -59,7 +58,6 @@ QAttention::QAttention(const OpKernelInfo& info) : OpKernel(info), AttentionC
template <typename T>
Status QAttention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
- bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
if (1 != input_idx) {
diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
index 4148aae4b9a35..aa47f365c0005 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
@@ -13,7 +13,7 @@ class DynamicQuantizeLSTM : public OpKernel, public LSTMBase {
DynamicQuantizeLSTM(const OpKernelInfo& info) : OpKernel(info), LSTMBase(info) {}
Status PrePack(const Tensor& tensor, int input_idx,
- AllocatorPtr alloc, bool save_prepacked_initializers, /*out*/ bool& is_packed,
+ AllocatorPtr alloc, /*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;
  Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
@@ -91,7 +91,6 @@ static void UseSharedPrePackedBuffersImpl(std::vector& prepacke
}
Status DynamicQuantizeLSTM::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
- bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
is_packed = false;
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
index cee3dfc6b3f28..c3e43f897c509 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -32,24 +32,47 @@ constexpr size_t A = 0,
bias = 5;
};
-int64_t GetAccuracyLevel(size_t nbits, size_t block_size, int64_t accuracy_level_attr) {
- const auto accuracy_level = std::clamp(accuracy_level_attr,
-                                         static_cast<int64_t>(CompMostAccurate),
-                                         static_cast<int64_t>(CompLeastAccurate));
-
- // Find a supported accuracy level that is not less accurate than the one given.
- // CompMostAccurate is always supported with the fallback implementation.
- // Note: A higher numeric accuracy level value means lower accuracy, so the comparison order is reversed.
- int64_t effective_accuracy_level = accuracy_level;
- for (; effective_accuracy_level > CompMostAccurate; --effective_accuracy_level) {
-    const auto compute_type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(effective_accuracy_level);
- if (MlasIsSQNBitGemmAvailable(nbits, block_size, compute_type)) {
- break;
- }
- }
+typedef enum {
+ Level0, /*!< input fp32, accumulator fp32 */
+ Level1, /*!< input fp32, accumulator fp32 */
+ Level2, /*!< input fp16, accumulator fp16 */
+ Level3, /*!< input bf16, accumulator fp32 */
+ Level4, /*!< input int8, accumulator int32 */
+} ACCURACY_LEVEL;
+
+// T: A data type.
+template <typename T>
+MLAS_QNBIT_GEMM_COMPUTE_TYPE
+GetComputeType(size_t nbits, size_t block_size, int64_t accuracy_level_attr) {
+  // For Fp32 inputs, only accuracy level 1 or 4 makes sense.
+  // (Non-ARM CPUs convert Fp16 to Fp32 anyway.) Converting Fp32 down to Fp16 loses precision,
+  // and the extra casting means there is no performance gain either.
+  if (accuracy_level_attr == static_cast<int64_t>(Level4) &&
+ MlasIsQNBitGemmAvailable(nbits, block_size, SQNBIT_CompInt8)) {
+ return SQNBIT_CompInt8;
+ }
+
+ return SQNBIT_CompFp32;
+}
- return effective_accuracy_level;
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+template <>
+MLAS_QNBIT_GEMM_COMPUTE_TYPE
+GetComputeType<MLFloat16>(size_t nbits, size_t block_size, int64_t accuracy_level_attr) {
+  // For Fp16 inputs, only accuracy level 2 or 4 makes sense.
+  // Converting Fp16 to Fp32 gives no precision increase and only hurts performance.
+  if (accuracy_level_attr == static_cast<int64_t>(Level4) &&
+ MlasIsQNBitGemmAvailable(nbits, block_size, HQNBIT_CompInt8)) {
+ return HQNBIT_CompInt8;
+ }
+
+  // If HQNBIT_CompFp16 is not supported, the implementation falls back to unpacked computation.
+ return HQNBIT_CompFp16;
}
+#endif // !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_ARM64
+
} // namespace
bool GetType(const NodeArg& node_arg, int32_t& type) {
@@ -74,10 +97,9 @@ class MatMulNBits final : public OpKernel {
        N_{narrow<size_t>(info.GetAttr<int64_t>("N"))},
        block_size_{narrow<size_t>(info.GetAttr<int64_t>("block_size"))},
        nbits_{narrow<size_t>(info.GetAttr<int64_t>("bits"))},
-      accuracy_level_{GetAccuracyLevel(nbits_, block_size_, info.GetAttr<int64_t>("accuracy_level"))},
has_g_idx_{info.GetInputCount() > InputIndex::g_idx && info.node().InputDefs()[InputIndex::g_idx]->Exists()},
has_bias_{info.GetInputCount() > InputIndex::bias && info.node().InputDefs()[InputIndex::bias]->Exists()},
-      compute_type_{static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(accuracy_level_)} {
+      compute_type_{GetComputeType<T1>(nbits_, block_size_, info.GetAttr<int64_t>("accuracy_level"))} {
const auto& node = info.node();
auto input_defs = node.InputDefs();
const NodeArg* zero_point_arg =
@@ -98,36 +120,26 @@ class MatMulNBits final : public OpKernel {
Status Compute(OpKernelContext* context) const override;
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
- bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;
- void ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx);
-
  Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers, int input_idx,
/*out*/ bool& used_shared_buffers) override;
-  std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) override;
-
- Status SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) override;
-
private:
const size_t K_;
const size_t N_;
const size_t block_size_;
const size_t nbits_;
- const int64_t accuracy_level_;
const bool has_g_idx_;
const bool has_bias_;
- const MLAS_SQNBIT_GEMM_COMPUTE_TYPE compute_type_;
+ const MLAS_QNBIT_GEMM_COMPUTE_TYPE compute_type_;
bool has_unquantized_zero_point_{false};
const bool column_wise_quant_{true};
  IAllocatorUniquePtr<void> packed_b_{};
  size_t packed_b_size_{0};
  IAllocatorUniquePtr<float> scales_fp32_{};
  IAllocatorUniquePtr<float> bias_fp32_{};
-  std::optional<Tensor> packed_tensor_{std::nullopt};
- MLDataType prepack_tensor_data_type_;
bool has_zp_input_{false};
@@ -152,27 +164,11 @@ class MatMulNBits final : public OpKernel {
Tensor* y,
AllocatorPtr& allocator,
concurrency::ThreadPool* thread_pool,
- const MatMulComputeHelper& helper) const {
- ORT_THROW("ComputeBPacked is not supported for T1 type.");
- }
+ const MatMulComputeHelper& helper) const;
};
-template <typename T1>
-void MatMulNBits<T1>::ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx) {
- if (input_idx == InputIndex::B) {
- prepack_tensor_data_type_ = tensor.DataType();
- }
-
-  TensorShapeVector weights_dims = {static_cast<int64_t>((packed_b_size_ - 1) / prepack_tensor_data_type_->Size()) + 1};
- packed_tensor_ = Tensor(prepack_tensor_data_type_,
- TensorShape(weights_dims),
- packed_b_.get(),
- OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator));
-}
-
template <typename T1>
Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
- bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -181,43 +177,40 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
return Status::OK();
}
- if (!MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
+ if (!MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
return Status::OK();
}
if (input_idx == InputIndex::B) {
- packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_);
+ packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_);
if (packed_b_size_ == 0) {
return Status::OK();
}
auto qptr = tensor.DataRaw();
    packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true);
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), nullptr, has_zp_input_, nullptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), nullptr, has_zp_input_, nullptr, nullptr);
is_packed = true;
- } else if (compute_type_ == CompInt8) {
+ } else if (compute_type_ == SQNBIT_CompInt8) {
#ifdef MLAS_TARGET_AMD64_IX86
if (input_idx == InputIndex::scales && packed_b_ != nullptr) {
auto sptr = tensor.Data();
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), sptr,
- has_zp_input_, nullptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), sptr,
+ has_zp_input_, nullptr, nullptr);
is_packed = false;
} else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) {
auto zptr = tensor.Data();
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr);
is_packed = false;
}
#endif // MLAS_TARGET_AMD64_IX86
}
- if (save_prepacked_initializers) {
- ConvertPrepackWeightIntoTensor(tensor, input_idx);
- }
-
return Status::OK();
}
+#if !defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || !defined(MLAS_TARGET_ARM64)
+// On platforms without ARM fp16 intrinsics, fp16 falls back to fp32.
template <>
Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
- bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -239,64 +232,37 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou
return Status::OK();
}
- if (!MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
+ if (!MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
return Status::OK();
}
if (input_idx == InputIndex::B) {
- packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_);
+ packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_);
if (packed_b_size_ == 0) {
return Status::OK();
}
auto qptr = tensor.DataRaw();
packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true);
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(),
- nullptr, has_zp_input_, nullptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(),
+ nullptr, has_zp_input_, nullptr, nullptr);
is_packed = true;
- } else if (compute_type_ == CompInt8) {
+ } else if (compute_type_ == SQNBIT_CompInt8) {
#ifdef MLAS_TARGET_AMD64_IX86
if (input_idx == InputIndex::scales && packed_b_ != nullptr) {
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
- scales_fp32_.get(), has_zp_input_, nullptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
+ scales_fp32_.get(), has_zp_input_, nullptr, nullptr);
is_packed = false;
} else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) {
auto zptr = tensor.Data<uint8_t>();
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
- nullptr, has_zp_input_, zptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
+ nullptr, has_zp_input_, zptr, nullptr);
is_packed = false;
}
#endif // MLAS_TARGET_AMD64_IX86
}
- if (save_prepacked_initializers) {
- ConvertPrepackWeightIntoTensor(tensor, input_idx);
- }
-
- return Status::OK();
-}
-
-template <typename T1>
-std::optional<Tensor> MatMulNBits<T1>::GetPrePackTensor(int input_idx) {
- // For this kernel, prepack is performed on input_B and possibly scales and zero_points.
- // During compute, scales and zero_points are used as-is; only the prepacked
- // buffer replaces input_B.
- // To cope with this logic, we need to return the latest prepacked buffer and only serialize
- // the latest one, so we always return packed_tensor_ here, not only for input_B.
- ORT_UNUSED_PARAMETER(input_idx);
- return std::move(packed_tensor_);
-}
-
-template <typename T1>
-Status MatMulNBits<T1>::SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) {
- if (input_idx == 1) {
- // pre_packed_tensor is a constant initialized tensor whose lifecycle is managed by session_state;
- // session_state will release its memory. packed_b_ must not release that memory, so
- // pass an empty/default buffer deleter here.
- // const_cast here is temporary and will be fixed in a follow-up PR.
- packed_b_ = BufferUniquePtr(const_cast<void*>(pre_packed_tensor.DataRaw()), BufferDeleter());
- }
-
return Status::OK();
}
+#endif // end !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_ARM64
template <typename T1>
Status MatMulNBits<T1>::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers, int input_idx,
@@ -311,20 +277,20 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector&
return Status::OK();
}
-template <>
-Status MatMulNBits<float>::ComputeBPacked(const Tensor* a,
- const Tensor* scales,
- const Tensor* zero_points,
- const Tensor* bias,
- Tensor* y,
- AllocatorPtr& allocator,
- concurrency::ThreadPool* thread_pool,
- const MatMulComputeHelper& helper) const {
- const auto* a_data = a->Data<float>();
- const auto* scales_data = scales->Data<float>();
+template <typename T1>
+Status MatMulNBits<T1>::ComputeBPacked(const Tensor* a,
+ const Tensor* scales,
+ const Tensor* zero_points,
+ const Tensor* bias,
+ Tensor* y,
+ AllocatorPtr& allocator,
+ concurrency::ThreadPool* thread_pool,
+ const MatMulComputeHelper& helper) const {
+ const auto* a_data = a->Data<T1>();
+ const auto* scales_data = scales->Data<T1>();
const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->DataRaw();
- const auto* bias_data = bias == nullptr ? nullptr : bias->Data<float>();
- auto* y_data = y->MutableData<float>();
+ const auto* bias_data = bias == nullptr ? nullptr : bias->Data<T1>();
+ auto* y_data = y->MutableData<T1>();
const size_t batch_count = helper.OutputOffsets().size();
const size_t M = static_cast<size_t>(helper.M());
@@ -333,19 +299,19 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a,
const size_t lda = helper.Lda(false);
IAllocatorUniquePtr<std::byte> workspace{};
- const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(
+ const size_t workspace_size = MlasQNBitGemmBatchWorkspaceSize(
M, N, K, batch_count, nbits_, block_size_, compute_type_);
if (workspace_size > 0) {
// Use reserve since no caching is needed
workspace = IAllocator::MakeUniquePtr<std::byte>(allocator, workspace_size, true);
}
- InlinedVector<MLAS_SQNBIT_GEMM_DATA_PARAMS> data(batch_count);
+ InlinedVector<MLAS_QNBIT_GEMM_DATA_PARAMS<T1>> data(batch_count);
for (size_t i = 0; i < batch_count; ++i) {
data[i].A = a_data + helper.LeftOffsets()[i];
data[i].lda = lda;
#ifdef MLAS_TARGET_AMD64_IX86
- if (compute_type_ == CompInt8) {
+ if (compute_type_ == SQNBIT_CompInt8) {
data[i].QuantBDataWorkspace = packed_b_.get();
}
#endif
@@ -356,11 +322,12 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a,
data[i].C = y_data + helper.OutputOffsets()[i];
data[i].ldc = N;
}
- MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(),
- thread_pool);
+ MlasQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(),
+ thread_pool);
return Status::OK();
}
+#if !defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || !defined(MLAS_TARGET_ARM64)
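+// Fallback fp16 ComputeBPacked: convert the fp16 inputs to fp32 buffers, run the fp32
+// QNBit GEMM batch, then convert the fp32 result back to fp16 for the output tensor.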
template <>
Status MatMulNBits<MLFloat16>::ComputeBPacked(const Tensor* a,
const Tensor* scales,
@@ -383,7 +350,7 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a,
const size_t lda = helper.Lda(false);
IAllocatorUniquePtr<std::byte> workspace{};
- const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(
+ const size_t workspace_size = MlasQNBitGemmBatchWorkspaceSize(
M, N, K, batch_count, nbits_, block_size_, compute_type_);
if (workspace_size > 0) {
// Use reserve since no caching is needed
@@ -417,12 +384,12 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a,
size_t c_size = static_cast<size_t>(y->Shape().Size());
std::vector<float> c_v(c_size);
- InlinedVector<MLAS_SQNBIT_GEMM_DATA_PARAMS> data(batch_count);
+ InlinedVector<MLAS_QNBIT_GEMM_DATA_PARAMS<float>> data(batch_count);
for (size_t i = 0; i < batch_count; ++i) {
data[i].A = tmp_a_data_ptr.get() + helper.LeftOffsets()[i];
data[i].lda = lda;
#ifdef MLAS_TARGET_AMD64_IX86
- if (compute_type_ == CompInt8) {
+ if (compute_type_ == SQNBIT_CompInt8) {
data[i].QuantBDataWorkspace = packed_b_.get();
}
#endif
@@ -433,11 +400,12 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a,
data[i].C = c_v.data() + helper.OutputOffsets()[i];
data[i].ldc = N;
}
- MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(),
- thread_pool);
+ MlasQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(),
+ thread_pool);
MlasConvertFloatToHalfBuffer(c_v.data(), y_data, c_size);
return Status::OK();
}
+#endif // end of !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_ARM64
template <>
Status MatMulNBits<MLFloat16>::ComputeBUnpacked(const Tensor* a,
@@ -573,9 +541,10 @@ Status MatMulNBits::ComputeBUnpacked(const Tensor* a,
const size_t ldb = helper.Ldb(true);
float* scales_ptr = nullptr;
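+ // Declared outside the if-block so the converted fp32 scales outlive it and scales_ptr stays valid.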
+ IAllocatorUniquePtr<float> temp_scales;
if (!scales_fp32_) {
auto scales_size = static_cast<size_t>(scales->Shape().Size());
- auto temp_scales = IAllocator::MakeUniquePtr<float>(allocator, scales_size, true);
+ temp_scales = IAllocator::MakeUniquePtr<float>(allocator, scales_size, true);
MlasConvertHalfToFloatBuffer(scales_data, temp_scales.get(), scales_size);
scales_ptr = temp_scales.get();
} else {
@@ -656,8 +625,9 @@ Status MatMulNBits::ComputeBUnpacked(const Tensor* a,
if (bias) {
float* bias_ptr = nullptr;
const size_t bias_size = static_cast<size_t>(bias->Shape().Size());
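+ // Declared outside the if-block so the converted fp32 bias outlives it and bias_ptr stays valid.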
+ IAllocatorUniquePtr<float> bias_temp;
if (!bias_fp32_) {
- auto bias_temp = IAllocator::MakeUniquePtr<float>(allocator, bias_size, true);
+ bias_temp = IAllocator::MakeUniquePtr<float>(allocator, bias_size, true);
MlasConvertHalfToFloatBuffer(bias->Data<MLFloat16>(), bias_temp.get(), bias_size);
bias_ptr = bias_temp.get();
} else {
@@ -710,11 +680,11 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
// clang-format on
if (has_single_b_matrix &&
- packed_b_) { // Assume that MlasSQNBitGemmBatch() always requires packed B.
- // If this changes, i.e., if MlasIsSQNBitGemmAvailable() can return true while
- // MlasSQNBitGemmPackQuantBDataSize() returns 0, we can consider calling MlasSQNBitGemmBatch()
+ packed_b_) { // Assume that MlasQNBitGemmBatch() always requires packed B.
+ // If this changes, i.e., if MlasIsQNBitGemmAvailable() can return true while
+ // MlasQNBitGemmPackQuantBDataSize() returns 0, we can consider calling MlasQNBitGemmBatch()
// with B directly too.
- if (MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
+ if (MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
return ComputeBPacked(a, scales, zero_points, bias, y, allocator, thread_pool, helper);
}
}
diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
index c9ee9e2cb760d..d5b8961cf8c5a 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -46,24 +46,13 @@ void ComputeJob(
const T* gamma_data,
const T* beta_data,
const T* bias_data,
- IAllocatorUniquePtr<float>& skip_float_uptr,
- IAllocatorUniquePtr<float>& gamma_float_uptr,
- IAllocatorUniquePtr<float>& beta_float_uptr,
- IAllocatorUniquePtr<float>& bias_float_uptr,
ptrdiff_t task_idx,
int hidden_size,
int64_t skip_size,
float epsilon,
bool simplified,
T* output_data,
- T* skip_input_bias_add_output_data,
- AllocatorPtr alloc) {
- ORT_UNUSED_PARAMETER(skip_float_uptr); // only used in MLFloat16 overload
- ORT_UNUSED_PARAMETER(gamma_float_uptr); // only used in MLFloat16 overload
- ORT_UNUSED_PARAMETER(beta_float_uptr); // only used in MLFloat16 overload
- ORT_UNUSED_PARAMETER(bias_float_uptr); // only used in MLFloat16 overload
- ORT_UNUSED_PARAMETER(alloc);
-
+ T* skip_input_bias_add_output_data) {
auto offset = task_idx * hidden_size;
const T* p_input = input_data + offset;
const T* p_skip = skip_data + (offset % skip_size);
@@ -107,101 +96,6 @@ void ComputeJob(
}
}
-void ComputeJob(
- const MLFloat16* input_data,
- const MLFloat16* skip_data,
- const MLFloat16* gamma_data,
- const MLFloat16* beta_data,
- const MLFloat16* bias_data,
- IAllocatorUniquePtr<float>& skip_float_uptr,
- IAllocatorUniquePtr<float>& gamma_float_uptr,
- IAllocatorUniquePtr<float>& beta_float_uptr,
- IAllocatorUniquePtr<float>& bias_float_uptr,
- ptrdiff_t task_idx,
- int hidden_size,
- int64_t skip_size,
- float epsilon,
- bool simplified,
- MLFloat16* output_data,
- MLFloat16* skip_input_bias_add_output_data,
- AllocatorPtr alloc) {
- auto offset = task_idx * hidden_size;
- const MLFloat16* p_input = input_data + offset;
- const MLFloat16* p_skip = skip_data + (offset % skip_size);
- MLFloat16* p_output = output_data + offset;
- MLFloat16* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset;
-
- float mean(0.0f);
- float mean_square(0.0f);
- const size_t num_elems = static_cast<size_t>(hidden_size);
-
- IAllocatorUniquePtr<float> input_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
- MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems);
-
- if (!skip_float_uptr) {
- skip_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
- MlasConvertHalfToFloatBuffer(p_skip, skip_float_uptr.get(), num_elems);
- }
-
- if (bias_data && !bias_float_uptr) {
- bias_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
- MlasConvertHalfToFloatBuffer(bias_data, bias_float_uptr.get(), num_elems);
- }
-
- IAllocatorUniquePtr<float> output_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
- float* output_float_ptr = output_float_uptr.get();
-
- const float* input_float_ptr = input_float_uptr.get();
- const float* skip_float_ptr = skip_float_uptr.get();
- const float* bias_float_ptr = bias_float_uptr.get();
- for (size_t h = 0; h < num_elems; h++) {
- float val = input_float_ptr[h] + skip_float_ptr[h];
-
- if (bias_float_uptr) {
- val += bias_float_ptr[h];
- }
-
- output_float_ptr[h] = val;
- mean += val;
- mean_square += val * val;
- }
-
- if (nullptr != p_skip_input_bias_add_output) {
- MlasConvertFloatToHalfBuffer(output_float_ptr, p_skip_input_bias_add_output, num_elems);
- }
-
- mean = mean / hidden_size;
- if (simplified) {
- mean_square = sqrt(mean_square / hidden_size + epsilon);
- } else {
- mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon);
- }
-
- if (!gamma_float_uptr) {
- gamma_float_uptr = std::move(input_float_uptr); // overwrite input with gamma values, since they have the same size
- MlasConvertHalfToFloatBuffer(gamma_data, gamma_float_uptr.get(), num_elems);
- }
-
- if (beta_data && !beta_float_uptr) {
- beta_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
- MlasConvertHalfToFloatBuffer(beta_data, beta_float_uptr.get(), num_elems);
- }
-
- const float* gamma_float_ptr = gamma_float_uptr.get();
- const float* beta_float_ptr = beta_float_uptr.get();
- for (size_t h = 0; h < num_elems; h++) {
- if (simplified) {
- output_float_ptr[h] = output_float_ptr[h] / mean_square * gamma_float_ptr[h];
- } else if (nullptr == beta_float_uptr) {
- output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h];
- } else {
- output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h] + beta_float_ptr[h];
- }
- }
-
- MlasConvertFloatToHalfBuffer(output_float_ptr, p_output, num_elems);
-}
-
void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr<float>& dest, bool& is_packed) {
if (tensor.GetElementType() == utils::ToTensorProtoElementType<MLFloat16>()) {
auto tensor_data_ptr = tensor.Data<MLFloat16>();
@@ -218,7 +112,12 @@ void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, I
template <typename T, bool simplified>
SkipLayerNorm<T, simplified>::SkipLayerNorm(const OpKernelInfo& op_kernel_info)
- : OpKernel(op_kernel_info), skip_fp32_(nullptr), gamma_fp32_(nullptr), beta_fp32_(nullptr), bias_fp32_(nullptr) {
+ : OpKernel(op_kernel_info),
+ prepacked_skip_fp32_size_(0),
+ prepacked_skip_fp32_data_(nullptr),
+ prepacked_gamma_fp32_data_(nullptr),
+ prepacked_beta_fp32_data_(nullptr),
+ prepacked_bias_fp32_data_(nullptr) {
ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK());
ORT_ENFORCE(epsilon_ >= 0);
}
@@ -226,10 +125,10 @@ SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info)
template