Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into snnn/webgpu_initial…
Browse files Browse the repository at this point in the history
…_get_capability
  • Loading branch information
snnn committed Oct 5, 2023
2 parents e9c943a + 5be79e2 commit 2b7fe0a
Show file tree
Hide file tree
Showing 104 changed files with 10,889 additions and 1,746 deletions.
109 changes: 0 additions & 109 deletions Package.swift

This file was deleted.

57 changes: 21 additions & 36 deletions cmake/onnxruntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,46 +18,35 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
set(OUTPUT_STYLE xcode)
endif()

set(ONNXRUNTIME_PUBLIC_HEADERS
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_c_api.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_cxx_api.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_float16.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_cxx_inline.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h"
)

if (onnxruntime_ENABLE_TRAINING_APIS)
list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_c_api.h")
list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_api.h")
list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_inline.h")
endif()

# This macro is to get the path of header files for mobile packaging, for iOS and Android
macro(get_mobile_api_headers _HEADERS)
# include both c and cxx api
set(${_HEADERS}
# Gets the public C/C++ API header files
function(get_c_cxx_api_headers HEADERS_VAR)
set(_headers
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_c_api.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_cxx_api.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_float16.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_cxx_inline.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_float16.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h"
"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h"
)

if (onnxruntime_ENABLE_TRAINING_APIS)
list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_c_api.h")
list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_api.h")
list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_inline.h")
list(APPEND _headers "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_c_api.h")
list(APPEND _headers "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_api.h")
list(APPEND _headers "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_inline.h")
endif()

# need to add header files for enabled EPs
foreach(f ${ONNXRUNTIME_PROVIDER_NAMES})
file(GLOB _provider_headers CONFIGURE_DEPENDS
"${REPO_ROOT}/include/onnxruntime/core/providers/${f}/*.h"
)
list(APPEND ${_HEADERS} "${_provider_headers}")
unset(_provider_headers)
list(APPEND _headers ${_provider_headers})
endforeach()
endmacro()

set(${HEADERS_VAR} ${_headers} PARENT_SCOPE)
endfunction()

get_c_cxx_api_headers(ONNXRUNTIME_PUBLIC_HEADERS)

# To check whether symbols.txt contains any extra lines, run
# nm -C -g --defined libonnxruntime.so |grep -v '\sA\s' | cut -f 3 -d ' ' | sort
Expand All @@ -84,11 +73,9 @@ if(WIN32)
"${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc"
)
elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
get_mobile_api_headers(APPLE_FRAMEWORK_HEADERS)

# apple framework requires the header file be part of the library
onnxruntime_add_shared_library(onnxruntime
${APPLE_FRAMEWORK_HEADERS}
${ONNXRUNTIME_PUBLIC_HEADERS}
"${CMAKE_CURRENT_BINARY_DIR}/generated_source.c"
)

Expand All @@ -107,10 +94,9 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
set_target_properties(onnxruntime PROPERTIES
FRAMEWORK TRUE
FRAMEWORK_VERSION A
PUBLIC_HEADER "${APPLE_FRAMEWORK_HEADERS}"
MACOSX_FRAMEWORK_INFO_PLIST ${CMAKE_CURRENT_BINARY_DIR}/Info.plist
VERSION ${ORT_VERSION}
SOVERSION ${ORT_VERSION}
MACOSX_FRAMEWORK_INFO_PLIST ${INFO_PLIST_PATH}
SOVERSION ${ORT_VERSION}
# Note: The PUBLIC_HEADER and VERSION properties for the 'onnxruntime' target will be set later in this file.
)
else()
onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c)
Expand Down Expand Up @@ -180,11 +166,10 @@ endif()

# we need to copy C/C++ API headers to be packed into Android AAR package
if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND onnxruntime_BUILD_JAVA)
get_mobile_api_headers(ANDROID_AAR_HEADERS)
set(ANDROID_HEADERS_DIR ${CMAKE_CURRENT_BINARY_DIR}/android/headers)
file(MAKE_DIRECTORY ${ANDROID_HEADERS_DIR})
# copy the header files one by one
foreach(h_ ${ANDROID_AAR_HEADERS})
foreach(h_ ${ONNXRUNTIME_PUBLIC_HEADERS})
get_filename_component(HEADER_NAME_ ${h_} NAME)
add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different ${h_} ${ANDROID_HEADERS_DIR}/${HEADER_NAME_})
endforeach()
Expand Down Expand Up @@ -328,7 +313,7 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK)
file(MAKE_DIRECTORY ${STATIC_FRAMEWORK_HEADER_DIR})

# copy the header files one by one, and the Info.plist
foreach(h_ ${APPLE_FRAMEWORK_HEADERS})
foreach(h_ ${ONNXRUNTIME_PUBLIC_HEADERS})
get_filename_component(HEADER_NAME_ ${h_} NAME)
add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different ${h_} ${STATIC_FRAMEWORK_HEADER_DIR}/${HEADER_NAME_})
endforeach()
Expand Down
4 changes: 3 additions & 1 deletion cmake/onnxruntime_providers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,9 @@ if (onnxruntime_USE_CUDA)
# When onnxruntime_USE_NCCL is off, drop the collective sources listed below
# from the CUDA contrib ops source list (onnxruntime_cuda_contrib_ops_cc_srcs).
if (NOT onnxruntime_USE_NCCL)
list(REMOVE_ITEM onnxruntime_cuda_contrib_ops_cc_srcs
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/nccl_kernels.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/sharding_spec.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/sharding.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_matmul.cc"
)
endif()
# add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
Expand Down Expand Up @@ -452,7 +455,6 @@ if (onnxruntime_USE_CUDA)
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_kernels.cc"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/megatron.cc"
)

list(REMOVE_ITEM onnxruntime_providers_cuda_src ${onnxruntime_cuda_nccl_op_srcs})
endif()
endif()
Expand Down
5 changes: 5 additions & 0 deletions cmake/onnxruntime_rocm_hipify.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,12 @@ if (NOT onnxruntime_ENABLE_ATEN)
list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc")
endif()
if (NOT onnxruntime_USE_NCCL)
  # When onnxruntime_USE_NCCL is off, add the collective sources below to the
  # contrib ops exclusion list. These are string patterns to exclude; do NOT
  # use wildcards (stars) such as collective/*.cc or *.h.
  list(APPEND contrib_ops_excluded_files
    "collective/nccl_kernels.cc"
    "collective/sharding.cc"
    "collective/sharding_spec.cc"
    "collective/distributed_matmul.cc"
  )
endif()

set(provider_excluded_files
Expand Down
3 changes: 3 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -3597,6 +3597,9 @@ struct OrtApi {
* "rpc_control_latency": QNN RPC control latency.
* "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
* "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
* "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
* dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
* may alter model/EP partitioning. Use only for debugging.
*
* SNPE supported keys:
* "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16",
Expand Down
5 changes: 5 additions & 0 deletions js/common/lib/inference-session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ export declare namespace InferenceSession {
wasm: WebAssemblyExecutionProviderOption;
webgl: WebGLExecutionProviderOption;
xnnpack: XnnpackExecutionProviderOption;
webgpu: WebGpuExecutionProviderOption;
webnn: WebNNExecutionProviderOption;
nnapi: NnapiExecutionProviderOption;
}
Expand Down Expand Up @@ -233,6 +234,10 @@ export declare namespace InferenceSession {
export interface XnnpackExecutionProviderOption extends ExecutionProviderOption {
readonly name: 'xnnpack';
}
/**
 * Execution provider option for the execution provider named 'webgpu'.
 */
export interface WebGpuExecutionProviderOption extends ExecutionProviderOption {
readonly name: 'webgpu';
/**
 * Optional preferred layout: either 'NCHW' or 'NHWC'.
 */
preferredLayout?: 'NCHW'|'NHWC';
}
export interface WebNNExecutionProviderOption extends ExecutionProviderOption {
readonly name: 'webnn';
deviceType?: 'cpu'|'gpu';
Expand Down
3 changes: 3 additions & 0 deletions js/web/docs/webgpu-operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ Do not modify directly.*
| Atan | ai.onnx(7+) | |
| Atanh | ai.onnx(9+) | |
| AveragePool | ai.onnx(7-9,10,11+); com.ms.internal.nhwc(11+) | need perf optimization; need implementing activation |
| BiasAdd | com.microsoft(1+) | |
| BiasSplitGelu | com.microsoft(1+) | |
| Cast | ai.onnx(6-8,9-12,13-18,19+) | |
| Ceil | ai.onnx(6-12,13+) | |
| Clip | ai.onnx(6-10,11,12,13+) | |
Expand Down Expand Up @@ -94,3 +96,4 @@ Do not modify directly.*
| Tile | ai.onnx(6-12,13+) | |
| Transpose | ai.onnx(1-12,13+) | need perf optimization |
| Unsqueeze | ai.onnx(1-10,11-12,13+) | |
| Where | ai.onnx(9-15,16+) | |
6 changes: 6 additions & 0 deletions js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// Licensed under the MIT License.

import {argMax, argMin, parseArgMinMaxAttributes} from './ops/argminmax';
import {biasAdd} from './ops/bias-add';
import {biasSplitGelu} from './ops/bias-split-gelu';
import * as binaryOps from './ops/binary-op';
import {concat, parseConcatAttributes} from './ops/concat';
import {conv, parseConvAttributes} from './ops/conv';
Expand All @@ -26,6 +28,7 @@ import {parseSplitAttributes, split} from './ops/split';
import {tile} from './ops/tile';
import {parseTransposeAttributes, transpose} from './ops/transpose';
import * as unaryOps from './ops/unary-op';
import {where} from './ops/where';
import {ComputeContext} from './types';

export type RunFunction = (context: ComputeContext, attribute?: unknown) => void;
Expand All @@ -45,6 +48,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
['Atanh', [unaryOps.atanh]],
// TODO: support new attributes for AveragePool-10
['AveragePool', [pool.averagePool, pool.parseAveragePoolAttributes]],
['BiasAdd', [biasAdd]],
['BiasSplitGelu', [biasSplitGelu]],
['Cast', [unaryOps.cast, unaryOps.parseCastAttributes]],
['Ceil', [unaryOps.ceil]],
['ClipV10', [unaryOps.clipV10]],
Expand Down Expand Up @@ -112,4 +117,5 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
['ThresholdedRelu', [unaryOps.thresholdedRelu, unaryOps.parseAlphaAttributes]],
['Tile', [tile]],
['Transpose', [transpose, parseTransposeAttributes]],
['Where', [where]],
]);
11 changes: 4 additions & 7 deletions js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
Original file line number Diff line number Diff line change
Expand Up @@ -163,17 +163,14 @@ export const createConv2DMatMulProgramInfo =
const outWidth = isChannelsLast ? outputShape[2] : outputShape[3];
const outHeight = isChannelsLast ? outputShape[1] : outputShape[2];
const outChannels = isChannelsLast ? outputShape[3] : outputShape[1];
const isVec4 = (((inChannels % 4 === 0 || inChannels % 3 === 0) && isChannelsLast) ||
(outWidth % 4 === 0 && !isChannelsLast)) &&
outChannels % 4 === 0;
// TODO: enable vec4 for NCHW
const isVec4 = isChannelsLast && (inChannels % 4 === 0 || inChannels % 3 === 0) && outChannels % 4 === 0;

// TODO: fine tune size
const dispatchX = isChannelsLast ? outChannels : outWidth * outHeight;
const dispatchY = isChannelsLast ? outWidth * outHeight : outChannels;
const workGroupSize: [number, number, number] =
isVec4 ? [8, 8, 1] : [dispatchX <= 4 ? 4 : 16, dispatchX > 4 && dispatchY <= 4 ? 4 : 16, 1];
const elementsPerThread =
isVec4 ? [4, 4, 1] : [dispatchX <= 4 ? 1 : 2, dispatchX > 4 && dispatchY <= 4 ? 1 : 2, 1];
const workGroupSize: [number, number, number] = [8, 8, 1];
const elementsPerThread = dimAOuter <= 8 ? [4, 1, 1] : [4, 4, 1];
const dispatch = [
Math.ceil(dispatchX / workGroupSize[0] / elementsPerThread[0]),
Math.ceil(dispatchY / workGroupSize[1] / elementsPerThread[1]),
Expand Down
Loading

0 comments on commit 2b7fe0a

Please sign in to comment.