
Commit

Merge branch 'main' into sajandhy/webgpu_add_fused_convtranspose
satyajandhyala authored Oct 4, 2023
2 parents 84f2097 + 1bc1157 commit 62983e9
Showing 63 changed files with 8,449 additions and 1,456 deletions.
57 changes: 21 additions & 36 deletions cmake/onnxruntime.cmake
@@ -18,46 +18,35 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
   set(OUTPUT_STYLE xcode)
 endif()
 
-set(ONNXRUNTIME_PUBLIC_HEADERS
-    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_c_api.h"
-    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_cxx_api.h"
-    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_float16.h"
-    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_cxx_inline.h"
-    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h"
-    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h"
-    )
-
-if (onnxruntime_ENABLE_TRAINING_APIS)
-  list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_c_api.h")
-  list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_api.h")
-  list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_inline.h")
-endif()
-
-# This macro is to get the path of header files for mobile packaging, for iOS and Android
-macro(get_mobile_api_headers _HEADERS)
-  # include both c and cxx api
-  set(${_HEADERS}
+# Gets the public C/C++ API header files
+function(get_c_cxx_api_headers HEADERS_VAR)
+  set(_headers
     "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_c_api.h"
     "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_cxx_api.h"
-    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_float16.h"
     "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_cxx_inline.h"
+    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_float16.h"
+    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h"
+    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h"
   )
 
   if (onnxruntime_ENABLE_TRAINING_APIS)
-    list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_c_api.h")
-    list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_api.h")
-    list(APPEND ${_HEADERS} "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_inline.h")
+    list(APPEND _headers "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_c_api.h")
+    list(APPEND _headers "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_api.h")
+    list(APPEND _headers "${REPO_ROOT}/orttraining/orttraining/training_api/include/onnxruntime_training_cxx_inline.h")
  endif()
 
   # need to add header files for enabled EPs
   foreach(f ${ONNXRUNTIME_PROVIDER_NAMES})
     file(GLOB _provider_headers CONFIGURE_DEPENDS
       "${REPO_ROOT}/include/onnxruntime/core/providers/${f}/*.h"
     )
-    list(APPEND ${_HEADERS} "${_provider_headers}")
-    unset(_provider_headers)
+    list(APPEND _headers ${_provider_headers})
   endforeach()
-endmacro()
+
+  set(${HEADERS_VAR} ${_headers} PARENT_SCOPE)
+endfunction()
 
+get_c_cxx_api_headers(ONNXRUNTIME_PUBLIC_HEADERS)
+
 #If you want to verify if there is any extra line in symbols.txt, run
 # nm -C -g --defined libonnxruntime.so |grep -v '\sA\s' | cut -f 3 -d ' ' | sort
@@ -84,11 +73,9 @@ if(WIN32)
     "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc"
   )
 elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
-  get_mobile_api_headers(APPLE_FRAMEWORK_HEADERS)
-
   # apple framework requires the header file be part of the library
   onnxruntime_add_shared_library(onnxruntime
-    ${APPLE_FRAMEWORK_HEADERS}
+    ${ONNXRUNTIME_PUBLIC_HEADERS}
     "${CMAKE_CURRENT_BINARY_DIR}/generated_source.c"
   )
 
@@ -107,10 +94,9 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
   set_target_properties(onnxruntime PROPERTIES
     FRAMEWORK TRUE
     FRAMEWORK_VERSION A
-    PUBLIC_HEADER "${APPLE_FRAMEWORK_HEADERS}"
-    MACOSX_FRAMEWORK_INFO_PLIST ${CMAKE_CURRENT_BINARY_DIR}/Info.plist
-    VERSION ${ORT_VERSION}
-    SOVERSION ${ORT_VERSION}
+    MACOSX_FRAMEWORK_INFO_PLIST ${INFO_PLIST_PATH}
+    SOVERSION ${ORT_VERSION}
+    # Note: The PUBLIC_HEADER and VERSION properties for the 'onnxruntime' target will be set later in this file.
   )
 else()
   onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c)
@@ -180,11 +166,10 @@ endif()
 
 # we need to copy C/C++ API headers to be packed into Android AAR package
 if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND onnxruntime_BUILD_JAVA)
-  get_mobile_api_headers(ANDROID_AAR_HEADERS)
   set(ANDROID_HEADERS_DIR ${CMAKE_CURRENT_BINARY_DIR}/android/headers)
   file(MAKE_DIRECTORY ${ANDROID_HEADERS_DIR})
   # copy the header files one by one
-  foreach(h_ ${ANDROID_AAR_HEADERS})
+  foreach(h_ ${ONNXRUNTIME_PUBLIC_HEADERS})
     get_filename_component(HEADER_NAME_ ${h_} NAME)
     add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different ${h_} ${ANDROID_HEADERS_DIR}/${HEADER_NAME_})
   endforeach()
@@ -328,7 +313,7 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK)
   file(MAKE_DIRECTORY ${STATIC_FRAMEWORK_HEADER_DIR})
 
   # copy the header files one by one, and the Info.plist
-  foreach(h_ ${APPLE_FRAMEWORK_HEADERS})
+  foreach(h_ ${ONNXRUNTIME_PUBLIC_HEADERS})
     get_filename_component(HEADER_NAME_ ${h_} NAME)
     add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different ${h_} ${STATIC_FRAMEWORK_HEADER_DIR}/${HEADER_NAME_})
   endforeach()
3 changes: 3 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3597,6 +3597,9 @@ struct OrtApi {
    *   "rpc_control_latency": QNN RPC control latency.
    *   "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
    *   "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
+   *   "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
+   *   dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
+   *   may alter model/EP partitioning. Use only for debugging.
    *
    * SNPE supported keys:
    *   "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16",
5 changes: 5 additions & 0 deletions js/common/lib/inference-session.ts
@@ -192,6 +192,7 @@ export declare namespace InferenceSession {
     wasm: WebAssemblyExecutionProviderOption;
     webgl: WebGLExecutionProviderOption;
     xnnpack: XnnpackExecutionProviderOption;
+    webgpu: WebGpuExecutionProviderOption;
     webnn: WebNNExecutionProviderOption;
     nnapi: NnapiExecutionProviderOption;
   }
@@ -233,6 +234,10 @@ export declare namespace InferenceSession {
   export interface XnnpackExecutionProviderOption extends ExecutionProviderOption {
     readonly name: 'xnnpack';
   }
+  export interface WebGpuExecutionProviderOption extends ExecutionProviderOption {
+    readonly name: 'webgpu';
+    preferredLayout?: 'NCHW'|'NHWC';
+  }
   export interface WebNNExecutionProviderOption extends ExecutionProviderOption {
     readonly name: 'webnn';
     deviceType?: 'cpu'|'gpu';
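With this change, onnxruntime-web callers can request the WebGPU EP with a typed option object. A minimal usage sketch — 'model.onnx' is a placeholder path, and the layout choice here is only illustrative, not a recommendation made by this commit:

  import * as ort from 'onnxruntime-web';

  // The executionProviders list accepts plain names or option objects;
  // the new interface types the 'webgpu' entry and its preferredLayout.
  const session = await ort.InferenceSession.create('model.onnx', {
    executionProviders: [{name: 'webgpu', preferredLayout: 'NHWC'}],
  });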
3 changes: 2 additions & 1 deletion js/react_native/e2e/package.json
@@ -10,7 +10,8 @@
   },
   "dependencies": {
     "react": "^18.1.0",
-    "react-native": "^0.69.1"
+    "react-native": "^0.69.1",
+    "react-native-fs": "^2.20.0"
   },
   "devDependencies": {
     "@babel/core": "^7.17.0",
16 changes: 14 additions & 2 deletions js/react_native/e2e/src/App.tsx
@@ -8,6 +8,7 @@ import { Image, Text, TextInput, View } from 'react-native';
 import { InferenceSession, Tensor } from 'onnxruntime-react-native';
 import MNIST, { MNISTInput, MNISTOutput, MNISTResult, } from './mnist-data-handler';
 import { Buffer } from 'buffer';
+import { readFile } from 'react-native-fs';
 
 interface State {
   session:
@@ -39,10 +40,21 @@ export default class App extends React.PureComponent<{}, State> {
       this.setState({ imagePath });
 
       const modelPath = await MNIST.getLocalModelPath();
-      const session: InferenceSession = await InferenceSession.create(modelPath);
+
+      // test creating session with path
+      console.log('Creating with path');
+      const pathSession: InferenceSession = await InferenceSession.create(modelPath);
+      pathSession.release();
+
+      // and with bytes
+      console.log('Creating with bytes');
+      const base64Str = await readFile(modelPath, 'base64');
+      const bytes = Buffer.from(base64Str, 'base64');
+      const session: InferenceSession = await InferenceSession.create(bytes);
       this.setState({ session });
 
-      void this.infer();
+      console.log('Test session created');
+      void await this.infer();
     } catch (err) {
       console.log(err.message);
     }
3 changes: 3 additions & 0 deletions js/web/docs/webgpu-operators.md
@@ -21,6 +21,8 @@ Do not modify directly.*
 | Atan | ai.onnx(7+) | |
 | Atanh | ai.onnx(9+) | |
 | AveragePool | ai.onnx(7-9,10,11+); com.ms.internal.nhwc(11+) | need perf optimization; need implementing activation |
+| BiasAdd | com.microsoft(1+) | |
+| BiasSplitGelu | com.microsoft(1+) | |
 | Cast | ai.onnx(6-8,9-12,13-18,19+) | |
 | Ceil | ai.onnx(6-12,13+) | |
 | Clip | ai.onnx(6-10,11,12,13+) | |
@@ -96,3 +98,4 @@ Do not modify directly.*
 | Tile | ai.onnx(6-12,13+) | |
 | Transpose | ai.onnx(1-12,13+) | need perf optimization |
 | Unsqueeze | ai.onnx(1-10,11-12,13+) | |
+| Where | ai.onnx(9-15,16+) | |
6 changes: 6 additions & 0 deletions js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -2,6 +2,8 @@
 // Licensed under the MIT License.
 
 import {argMax, argMin, parseArgMinMaxAttributes} from './ops/argminmax';
+import {biasAdd} from './ops/bias-add';
+import {biasSplitGelu} from './ops/bias-split-gelu';
 import * as binaryOps from './ops/binary-op';
 import {concat, parseConcatAttributes} from './ops/concat';
 import {conv, parseConvAttributes} from './ops/conv';
@@ -26,6 +28,7 @@ import {parseSplitAttributes, split} from './ops/split';
 import {tile} from './ops/tile';
 import {parseTransposeAttributes, transpose} from './ops/transpose';
 import * as unaryOps from './ops/unary-op';
+import {where} from './ops/where';
 import {ComputeContext} from './types';
 
 export type RunFunction = (context: ComputeContext, attribute?: unknown) => void;
@@ -45,6 +48,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
   ['Atanh', [unaryOps.atanh]],
   // TODO: support new attributes for AveragePool-10
   ['AveragePool', [pool.averagePool, pool.parseAveragePoolAttributes]],
+  ['BiasAdd', [biasAdd]],
+  ['BiasSplitGelu', [biasSplitGelu]],
   ['Cast', [unaryOps.cast, unaryOps.parseCastAttributes]],
   ['Ceil', [unaryOps.ceil]],
   ['ClipV10', [unaryOps.clipV10]],
@@ -114,4 +119,5 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
   ['ThresholdedRelu', [unaryOps.thresholdedRelu, unaryOps.parseAlphaAttributes]],
   ['Tile', [tile]],
   ['Transpose', [transpose, parseTransposeAttributes]],
+  ['Where', [where]],
 ]);
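Each map entry pairs an ONNX operator name with a run function and, optionally, an attribute parser. As a rough sketch of the contract a new kernel satisfies — the operator name and function below are hypothetical, not part of this commit:

  import {ComputeContext} from './types';

  // A run function receives the compute context; a real kernel validates
  // context.inputs and then enqueues a GPU program via context.compute(...),
  // as bias-add.ts below does.
  const myIdentity = (context: ComputeContext): void => {
    // validate context.inputs here, then:
    // context.compute(createMyIdentityProgramInfo(metadata, context.inputs));  // hypothetical helper
  };

  // Registration is then a single entry in WEBGPU_OP_RESOLVE_RULES:
  //   ['MyIdentity', [myIdentity]],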
69 changes: 69 additions & 0 deletions js/web/lib/wasm/jsep/webgpu/ops/bias-add.ts
@@ -0,0 +1,69 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types';

import {inputVariable, outputVariable, ShaderHelper} from './common';

const validateInputs = (inputs: readonly TensorView[]): void => {
  if (inputs[0].dims.length !== 3) {
    throw new Error('input should have 3 dimensions');
  }

  if (![320, 640, 1280].includes(inputs[0].dims[2])) {
    throw new Error('number of channels should be 320, 640 or 1280');
  }

  if (inputs[1].dims.length !== 1) {
    throw new Error('bias is expected to have 1 dimension');
  }

  if (inputs[0].dims[2] !== inputs[1].dims[0]) {
    throw new Error('last dimension of input and bias are not the same');
  }
};

const createBiasAddProgramInfo = (metadata: ProgramMetadata, inputs: readonly TensorView[]): ProgramInfo => {
  const outputShape = inputs[0].dims;

  const channels = inputs[0].dims[2];
  // since the channel count can only be 320/640/1280, it is always divisible by 4
  const outputSize = ShapeUtil.size(outputShape) / 4;

  const dataType = inputs[0].dataType;
  const input = inputVariable('input', dataType, outputShape, 4);
  const bias = inputVariable('bias', dataType, [channels], 4);
  const residual = inputVariable('residual', dataType, outputShape, 4);
  const output = outputVariable('output', dataType, outputShape, 4);

  const getShaderSource = (shaderHelper: ShaderHelper) => `
  const channels = ${channels}u / 4;
  ${shaderHelper.declareVariables(input, bias, residual, output)}
  ${shaderHelper.mainStart()}
    ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
    let value = ${input.getByOffset('global_idx')}
        + ${bias.getByOffset('global_idx % channels')} + ${residual.getByOffset('global_idx')};
    ${output.setByOffset('global_idx', 'value')}
  }`;

  return {
    ...metadata,
    outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}],
    getShaderSource,
    dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
  };
};

export const biasAdd = (context: ComputeContext): void => {
  validateInputs(context.inputs);
  const inputTypes = Array(context.inputs.length).fill(GpuDataType.default);
  const metadata = {
    name: 'BiasAdd',
    inputTypes,
  };

  context.compute(createBiasAddProgramInfo(metadata, context.inputs));
};
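For intuition: per vec4 element, the shader computes input plus bias (broadcast over the last dimension) plus residual. A plain CPU reference of the same semantics, usable as a test oracle — the function name and flattened layout are illustrative, not part of the commit:

  // input and residual have shape [batch, seqLen, channels], flattened here;
  // bias has shape [channels].
  function biasAddReference(
      input: Float32Array, bias: Float32Array, residual: Float32Array, channels: number): Float32Array {
    const output = new Float32Array(input.length);
    for (let i = 0; i < input.length; i++) {
      // the position within the last dimension selects the bias element
      output[i] = input[i] + bias[i % channels] + residual[i];
    }
    return output;
  }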
76 changes: 76 additions & 0 deletions js/web/lib/wasm/jsep/webgpu/ops/bias-split-gelu.ts
@@ -0,0 +1,76 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types';

import {inputVariable, outputVariable, ShaderHelper} from './common';
import {erfImpl} from './unary-op';

const validateInputs = (inputs: readonly TensorView[]): void => {
  if (inputs[0].dims.length !== 3) {
    throw new Error('input should have 3 dimensions');
  }

  if (![2560, 5120, 10240].includes(inputs[0].dims[2])) {
    throw new Error('hidden state should be 2560, 5120 or 10240');
  }

  if (inputs[1].dims.length !== 1) {
    throw new Error('bias is expected to have 1 dimension');
  }

  if (inputs[0].dims[2] !== inputs[1].dims[0]) {
    throw new Error('last dimension of input and bias are not the same');
  }
};

const createBiasSplitGeluProgramInfo = (metadata: ProgramMetadata, inputs: readonly TensorView[]): ProgramInfo => {
  const outputShape = inputs[0].dims.slice();
  outputShape[2] = outputShape[2] / 2;

  const input = inputVariable('input', inputs[0].dataType, inputs[0].dims, 4);
  const bias = inputVariable('bias', inputs[0].dataType, [inputs[0].dims[2]], 4);
  const output = outputVariable('output', inputs[0].dataType, outputShape, 4);

  const outputSize = ShapeUtil.size(outputShape) / 4;

  const getShaderSource = (shaderHelper: ShaderHelper) => `
  const M_SQRT2 = sqrt(2.0);
  const halfChannels = ${inputs[0].dims[2] / 4 / 2}u;
  ${shaderHelper.declareVariables(input, bias, output)}
  ${erfImpl('vec4f')}
  ${shaderHelper.mainStart()}
    ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
    let biasIdx = global_idx % halfChannels;
    let batchIndex = global_idx / halfChannels;
    let inputOffset = biasIdx + batchIndex * halfChannels * 2;
    let valueLeft = input[inputOffset] + bias[biasIdx];
    let valueRight = input[inputOffset + halfChannels] + bias[biasIdx + halfChannels];
    let geluRight = valueRight * 0.5 * (erf_vf32(valueRight / M_SQRT2) + 1);
    ${output.setByOffset('global_idx', 'valueLeft * geluRight')}
  }`;

  return {
    ...metadata,
    outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}],
    getShaderSource,
    dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
  };
};

export const biasSplitGelu = (context: ComputeContext): void => {
  validateInputs(context.inputs);

  const metadata = {
    name: 'BiasSplitGelu',
    inputTypes: [GpuDataType.default, GpuDataType.default],
  };

  context.compute(createBiasSplitGeluProgramInfo(metadata, context.inputs));
};
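The shader splits the last dimension in half, adds the matching bias halves, and multiplies the left half by GELU of the right half: output = (x_left + b_left) * GELU(x_right + b_right), with GELU(v) = v * 0.5 * (1 + erf(v / sqrt(2))). A CPU reference of those semantics — names are illustrative, and erfApprox is the Abramowitz–Stegun polynomial approximation rather than the WGSL erfImpl used above:

  function erfApprox(x: number): number {
    // Abramowitz–Stegun 7.1.26 approximation of erf, adequate for a sanity check
    const sign = Math.sign(x);
    const ax = Math.abs(x);
    const t = 1 / (1 + 0.3275911 * ax);
    const poly = ((((1.061405429 * t - 1.453152027) * t + 1.421413741) * t - 0.284496736) * t + 0.254829592) * t;
    return sign * (1 - poly * Math.exp(-ax * ax));
  }

  // input: [rows, hidden] flattened; bias: [hidden]; output: [rows, hidden / 2]
  function biasSplitGeluReference(input: Float32Array, bias: Float32Array, hidden: number): Float32Array {
    const half = hidden / 2;
    const rows = input.length / hidden;
    const output = new Float32Array(rows * half);
    for (let r = 0; r < rows; r++) {
      for (let c = 0; c < half; c++) {
        const left = input[r * hidden + c] + bias[c];
        const right = input[r * hidden + half + c] + bias[half + c];
        output[r * half + c] = left * right * 0.5 * (erfApprox(right / Math.SQRT2) + 1);
      }
    }
    return output;
  }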
