Merge branch 'main' into user/sumita/padfusion
sumitsays committed Nov 1, 2023
2 parents b33bdaa + 41e21ab commit 75425d7
Showing 193 changed files with 10,908 additions and 2,425 deletions.
22 changes: 0 additions & 22 deletions .github/stale.yml

This file was deleted.

34 changes: 34 additions & 0 deletions .github/workflows/stale.yml
@@ -0,0 +1,34 @@
name: Close stale issues
on:
# Dictates when this workflow runs, using cron syntax (times in UTC); "0 15 * * *" fires daily at 15:00 UTC
schedule:
- cron: "0 15 * * *"
# Allows you to run this workflow manually from the Actions tab
# workflow_dispatch:

jobs:
close-stale-issues:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v8  # exact version pin garbled by the page's email-protection artifact; v8 is an assumption
with:
# Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale
exempt-issue-labels: contributions welcome, feature request, regression
# Number of days without activity before the actions/stale action labels an issue
days-before-issue-stale: 30
# Number of days without activity before the actions/stale action closes an issue
days-before-issue-close: 7
# Label you want to apply to issues that have been inactive for the amount of time specified by days-before-issue-stale
stale-issue-label: "stale"
# Comment that you want to add to issues that are labeled by the actions/stale action
stale-issue-message: "This issue has been automatically marked as stale due to inactivity and will be closed in 7 days if no further activity occurs. If further support is needed, please provide an update and/or more details."
# Comment that you want to add to issues that are closed by the actions/stale action
close-issue-message: "This issue has been automatically closed due to inactivity. Please reactivate if further support is needed."
# If you never want this action to label PRs, set this value to -1
days-before-pr-stale: -1
# If you never want this action to close PRs, set this value to -1
days-before-pr-close: -1
repo-token: ${{ secrets.GITHUB_TOKEN }}
12 changes: 1 addition & 11 deletions cgmanifests/generated/cgmanifest.json
@@ -26,7 +26,7 @@
"component": {
"type": "git",
"git": {
"commitHash": "0c296085f9f65f0f8ef7aec7b9eed55faf37dc40",
"commitHash": "b86cc54efce19530fb953e4b21f57e6b3888534c",
"repositoryUrl": "https://github.com/onnx/onnx.git"
},
"comments": "git submodule at cmake/external/onnx"
@@ -192,16 +192,6 @@
"comments": "mp11"
}
},
{
"component": {
"type": "git",
"git": {
"commitHash": "6a20ba82b439ea1fd650da4d389e96b60a1dd828",
"repositoryUrl": "https://github.com/onnx/onnx.git"
},
"comments": "onnx"
}
},
{
"component": {
"type": "git",
18 changes: 0 additions & 18 deletions cmake/CMakeLists.txt
@@ -1282,14 +1282,6 @@ if (onnxruntime_USE_OPENVINO)
add_definitions(-DOPENVINO_CONFIG_CPU_FP16=1)
endif()

if (onnxruntime_USE_OPENVINO_VPUX_FP16)
add_definitions(-DOPENVINO_CONFIG_VPUX_FP16=1)
endif()

if (onnxruntime_USE_OPENVINO_VPUX_U8)
add_definitions(-DOPENVINO_CONFIG_VPUX_U8=1)
endif()

if (onnxruntime_USE_OPENVINO_GPU_FP32_NP)
add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
@@ -1310,16 +1302,6 @@ if (onnxruntime_USE_OPENVINO)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
endif()

if (onnxruntime_USE_OPENVINO_VPUX_FP32_NP)
add_definitions(-DOPENVINO_CONFIG_VPUX_FP32=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
endif()

if (onnxruntime_USE_OPENVINO_VPUX_FP16_NP)
add_definitions(-DOPENVINO_CONFIG_VPUX_FP16=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
endif()

if (onnxruntime_USE_OPENVINO_HETERO)
add_definitions(-DOPENVINO_CONFIG_HETERO=1)
add_definitions(-DDEVICE_NAME="${onnxruntime_USE_OPENVINO_DEVICE}")
2 changes: 1 addition & 1 deletion cmake/deps.txt
@@ -24,7 +24,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36
microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
onnx;https://github.com/onnx/onnx/archive/6a20ba82b439ea1fd650da4d389e96b60a1dd828.zip;179a22ad4cd67109c60031ae4b6cf2f434d8bd7e
onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.15.0.zip;54c3f960a0541c5d8d3e60c2933e11f5d3688a11
#use the commit of supporting all the plugins and TRT 8.6-GA (https://github.com/onnx/onnx-tensorrt/commit/0462dc31ae78f48744b6141ae376df1f96d3f459)
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/a43ce67187bab219520fd80f21af8bbd4354bc8c.zip;572535aefef477050f86744dfab1fef840198035
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
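
Each entry above follows a `name;url;hash` layout, and the 40-hex-character third field looks like a SHA-1 digest of the archive. A hedged verification sketch, under that assumption:

```python
import hashlib
import urllib.request

# Assumption: the third deps.txt field is the SHA-1 of the downloaded zip.
url = "https://github.com/onnx/onnx/archive/refs/tags/v1.15.0.zip"
expected = "54c3f960a0541c5d8d3e60c2933e11f5d3688a11"
data = urllib.request.urlopen(url).read()
assert hashlib.sha1(data).hexdigest() == expected, "checksum mismatch"
```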
2 changes: 1 addition & 1 deletion cmake/external/onnx
4 changes: 3 additions & 1 deletion cmake/onnxruntime_providers_cuda.cmake
@@ -38,6 +38,8 @@
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/sharding.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_matmul.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_slice.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_reshape.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_expand.cc"
)
endif()
# add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
@@ -246,4 +248,4 @@
install(TARGETS onnxruntime_providers_cuda
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
7 changes: 7 additions & 0 deletions cmake/onnxruntime_python.cmake
@@ -387,6 +387,9 @@ if (onnxruntime_ENABLE_TRAINING)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_fused_ops_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/*"
)
file(GLOB onnxruntime_python_ortmodule_graph_optimizers_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/graph_optimizers/*"
)
file(GLOB onnxruntime_python_ort_triton_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ort_triton/*.py"
)
@@ -741,6 +744,7 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/fused_ops
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/graph_optimizers
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton/kernel
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/utils
@@ -794,6 +798,9 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_fused_ops_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_graph_optimizers_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/graph_optimizers/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ort_triton_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton/
6 changes: 6 additions & 0 deletions cmake/onnxruntime_rocm_hipify.cmake
@@ -48,6 +48,9 @@ set(contrib_ops_excluded_files
"diffusion/group_norm_impl.cu"
"diffusion/group_norm_impl.h"
"diffusion/nhwc_conv.cc"
"math/gemm_float8.cc"
"math/gemm_float8.cu"
"math/gemm_float8.h"
"quantization/attention_quantization.cc"
"quantization/attention_quantization.h"
"quantization/attention_quantization_impl.cu"
@@ -103,6 +106,9 @@ if (NOT onnxruntime_USE_NCCL)
list(APPEND contrib_ops_excluded_files "collective/sharding.cc")
list(APPEND contrib_ops_excluded_files "collective/sharding_spec.cc")
list(APPEND contrib_ops_excluded_files "collective/distributed_matmul.cc")
list(APPEND contrib_ops_excluded_files "collective/distributed_slice.cc")
list(APPEND contrib_ops_excluded_files "collective/distributed_reshape.cc")
list(APPEND contrib_ops_excluded_files "collective/distributed_expand.cc")
endif()

set(provider_excluded_files
136 changes: 135 additions & 1 deletion docs/ContribOperators.md
@@ -40,6 +40,7 @@ Do not modify directly.*
* <a href="#com.microsoft.GatherND">com.microsoft.GatherND</a>
* <a href="#com.microsoft.Gelu">com.microsoft.Gelu</a>
* <a href="#com.microsoft.GemmFastGelu">com.microsoft.GemmFastGelu</a>
* <a href="#com.microsoft.GemmFloat8">com.microsoft.GemmFloat8</a>
* <a href="#com.microsoft.GreedySearch">com.microsoft.GreedySearch</a>
* <a href="#com.microsoft.GridSample">com.microsoft.GridSample</a>
* <a href="#com.microsoft.GroupNorm">com.microsoft.GroupNorm</a>
@@ -94,6 +95,7 @@ Do not modify directly.*
* <a href="#com.microsoft.RotaryEmbedding">com.microsoft.RotaryEmbedding</a>
* <a href="#com.microsoft.SampleOp">com.microsoft.SampleOp</a>
* <a href="#com.microsoft.Sampling">com.microsoft.Sampling</a>
* <a href="#com.microsoft.SkipGroupNorm">com.microsoft.SkipGroupNorm</a>
* <a href="#com.microsoft.SkipLayerNormalization">com.microsoft.SkipLayerNormalization</a>
* <a href="#com.microsoft.SkipSimplifiedLayerNormalization">com.microsoft.SkipSimplifiedLayerNormalization</a>
* <a href="#com.microsoft.Snpe">com.microsoft.Snpe</a>
@@ -2137,6 +2139,71 @@ This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
</dl>


### <a name="com.microsoft.GemmFloat8"></a><a name="com.microsoft.gemmfloat8">**com.microsoft.GemmFloat8**</a>

Generic Gemm for float and float 8.

#### Version

This version of the operator has been available since version 1 of the 'com.microsoft' operator set.

#### Attributes

<dl>
<dt><tt>activation</tt> : string</dt>
<dd>Activation function, RELU or GELU or NONE (default).</dd>
<dt><tt>alpha</tt> : float</dt>
<dd>Scalar multiplier for the product of input tensors A * B.</dd>
<dt><tt>beta</tt> : float</dt>
<dd>Scalar multiplier for the product of input bias C.</dd>
<dt><tt>dtype</tt> : int</dt>
<dd>Output Type. Same definition as attribute 'to' for operator Cast.</dd>
<dt><tt>transA</tt> : int</dt>
<dd>Whether A should be transposed. Float 8 is only supported with transA=0.</dd>
<dt><tt>transB</tt> : int</dt>
<dd>Whether B should be transposed. Float 8 is only supported with transB=1.</dd>
</dl>

#### Inputs (2 - 6)

<dl>
<dt><tt>A</tt> : TA</dt>
<dd>Input tensor A. The shape of A should be (M, K) if transA is 0, or (K, M) if transA is non-zero.</dd>
<dt><tt>B</tt> : TB</dt>
<dd>Input tensor B. The shape of B should be (K, N) if transB is 0, or (N, K) if transB is non-zero.</dd>
<dt><tt>C</tt> (optional) : TC</dt>
<dd>Input tensor C.</dd>
<dt><tt>scaleA</tt> (optional) : TS</dt>
<dd>Scale of tensor A if A is a float 8 tensor</dd>
<dt><tt>scaleB</tt> (optional) : TS</dt>
<dd>Scale of tensor B if B is a float 8 tensor</dd>
<dt><tt>scaleY</tt> (optional) : TS</dt>
<dd>Scale of the output tensor if A or B is float 8.</dd>
</dl>

#### Outputs

<dl>
<dt><tt>Y</tt> : TR</dt>
<dd>Output tensor of shape (M, N).</dd>
</dl>

#### Type Constraints

<dl>
<dt><tt>TA</tt> : tensor(float8e4m3fn), tensor(float8e5m2), tensor(float16), tensor(bfloat16), tensor(float)</dt>
<dd>Constrain the type of input A.</dd>
<dt><tt>TB</tt> : tensor(float8e4m3fn), tensor(float8e5m2), tensor(float16), tensor(bfloat16), tensor(float)</dt>
<dd>Constrain the type of input B.</dd>
<dt><tt>TC</tt> : tensor(float16), tensor(bfloat16), tensor(float)</dt>
<dd>Constrain the type of input C.</dd>
<dt><tt>TR</tt> : tensor(float8e4m3fn), tensor(float8e5m2), tensor(float16), tensor(bfloat16), tensor(float)</dt>
<dd>Constrain the result type.</dd>
<dt><tt>TS</tt> : tensor(float)</dt>
<dd>Constrain the type of all input scales (scaleA, scaleB, scaleY).</dd>
</dl>
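
As an illustration, here is a minimal sketch of building such a node with the ONNX helper API. The tensor names and attribute values are hypothetical, and the conventional Gemm formula Y = activation(alpha * A' * B' + beta * C) is assumed, since the schema above does not spell it out:

```python
from onnx import TensorProto, helper

# Hypothetical GemmFloat8 node; input/output names are illustrative.
# An empty string in `inputs` skips the optional C input.
node = helper.make_node(
    "GemmFloat8",
    inputs=["A", "B", "", "scaleA", "scaleB"],
    outputs=["Y"],
    domain="com.microsoft",
    activation="NONE",
    alpha=1.0,
    beta=0.0,
    dtype=TensorProto.FLOAT16,  # output type; same semantics as Cast's 'to'
    transA=0,                   # float 8 is only supported with transA=0
    transB=1,                   # float 8 is only supported with transB=1
)
```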


### <a name="com.microsoft.GreedySearch"></a><a name="com.microsoft.greedysearch">**com.microsoft.GreedySearch**</a>

Greedy Search for text generation.
@@ -2276,7 +2343,7 @@ This version of the operator has been available since version 1 of the 'com.microsoft' operator set.

<dl>
<dt><tt>activation</tt> : int (required)</dt>
<dd>Activation after group normalization: 0 for None, 1 for Swish</dd>
<dd>Activation after group normalization: 0 for None, 1 for SiLU</dd>
<dt><tt>channels_last</tt> : int</dt>
<dd>1 if the input and output are in the NHWC layout, 0 if it is in the NCHW layout. Defaults to 1.</dd>
<dt><tt>epsilon</tt> : float</dt>
@@ -2516,6 +2583,7 @@ This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
Input B is stored as uint8_t with shape: [(N * K + 1) / 2].
Input absmax is stored in the same type as the original type of B (float32, float16), with shape: [(N * K + block_size - 1) / block_size].
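
A quick sanity check of those shapes (a sketch; the N, K, and block_size values here are made up for illustration):

```python
# Two 4-bit weights pack into one uint8 byte; one absmax scale per block.
N, K, block_size = 4096, 4096, 64
packed_B_numel = (N * K + 1) // 2                      # 8388608
absmax_numel = (N * K + block_size - 1) // block_size  # 262144
```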


#### Version

This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
@@ -5017,6 +5085,72 @@ This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
</dl>


### <a name="com.microsoft.SkipGroupNorm"></a><a name="com.microsoft.skipgroupnorm">**com.microsoft.SkipGroupNorm**</a>

This operator element-wise adds x, skip and bias, then applies group normalization and an optional activation.

It transforms the input according to
s = x + skip + bias
y = gamma * (s - mean) / sqrt(variance + epsilon) + beta

The input channels are separated into num_groups groups, each containing num_channels / num_groups channels.
num_channels must be divisible by num_groups.
The mean and standard deviation of s are calculated separately over each group.
The gamma and beta inputs are per-channel affine transform parameter vectors of size num_channels.

The activation attribute can be used to enable activation after group normalization.
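
A minimal NumPy sketch of these semantics, assuming channels_last is 1 and activation is 1 (SiLU); this is illustrative only, not the actual kernel:

```python
import numpy as np

def skip_group_norm(x, skip, bias, gamma, beta, groups, epsilon=1e-5):
    # x, skip: (N, H, W, C); bias, gamma, beta: (C,)
    s = x + skip + bias                               # optional second output S
    n, h, w, c = s.shape
    g = s.reshape(n, h * w, groups, c // groups)
    mean = g.mean(axis=(1, 3), keepdims=True)         # statistics per group
    var = g.var(axis=(1, 3), keepdims=True)
    y = ((g - mean) / np.sqrt(var + epsilon)).reshape(n, h, w, c)
    y = gamma * y + beta                              # per-channel affine
    return y / (1.0 + np.exp(-y)), s                  # SiLU: y * sigmoid(y)
```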

#### Version

This version of the operator has been available since version 1 of the 'com.microsoft' operator set.

#### Attributes

<dl>
<dt><tt>activation</tt> : int (required)</dt>
<dd>Activation after group normalization: 0 for None, 1 for SiLU</dd>
<dt><tt>channels_last</tt> : int</dt>
<dd>1 if the input and output are in the NHWC layout, 0 if it is in the NCHW layout. Defaults to 1.</dd>
<dt><tt>epsilon</tt> : float</dt>
<dd>The epsilon value to use to avoid division by zero</dd>
<dt><tt>groups</tt> : int (required)</dt>
<dd>The number of groups of channels. It should be a divisor of the number of channels C</dd>
</dl>

#### Inputs (4 - 5)

<dl>
<dt><tt>X</tt> : T</dt>
<dd>Input data tensor. Dimensions are (N x H x W x C) when channels_last is 1 or (N x C x H x W) otherwise, where N is the batch size, C is the number of channels, and H and W are the height and width of the data</dd>
<dt><tt>gamma</tt> : M</dt>
<dd>1D gamma tensor for normalization with shape (C), where C is the number of channels</dd>
<dt><tt>beta</tt> : M</dt>
<dd>1D beta tensor for normalization with shape (C), where C is the number of channels</dd>
<dt><tt>skip</tt> : T</dt>
<dd>4D or 2D skip tensor. The shape can be (N x H x W x C) or (N x 1 x 1 x C) or (N x C)</dd>
<dt><tt>bias</tt> (optional) : T</dt>
<dd>1D bias tensor. Dimensions are (C), where C is the number of channels</dd>
</dl>

#### Outputs (1 - 2)

<dl>
<dt><tt>Y</tt> : T</dt>
<dd>The output tensor of the same shape as X</dd>
<dt><tt>S</tt> (optional) : T</dt>
<dd>The element-wise sum of input x, skip and bias tensors. It has the same shape as X</dd>
</dl>

#### Type Constraints

<dl>
<dt><tt>T</tt> : tensor(float16), tensor(float)</dt>
<dd>Constrain input X, skip, bias and output Y, S types to float tensors.</dd>
<dt><tt>M</tt> : tensor(float16), tensor(float)</dt>
<dd>Constrain gamma and beta to float tensors.</dd>
</dl>


### <a name="com.microsoft.SkipLayerNormalization"></a><a name="com.microsoft.skiplayernormalization">**com.microsoft.SkipLayerNormalization**</a>

Skip and Layer Normalization Fusion