Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin' into prathikrao/where-op-bfloat16
Browse files Browse the repository at this point in the history
  • Loading branch information
Prathik Rao committed Nov 1, 2023
2 parents 40df5af + 9e8ad39 commit 76e7c79
Show file tree
Hide file tree
Showing 104 changed files with 4,109 additions and 1,134 deletions.
12 changes: 1 addition & 11 deletions cgmanifests/generated/cgmanifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"component": {
"type": "git",
"git": {
"commitHash": "0c296085f9f65f0f8ef7aec7b9eed55faf37dc40",
"commitHash": "b86cc54efce19530fb953e4b21f57e6b3888534c",
"repositoryUrl": "https://github.com/onnx/onnx.git"
},
"comments": "git submodule at cmake/external/onnx"
Expand Down Expand Up @@ -192,16 +192,6 @@
"comments": "mp11"
}
},
{
"component": {
"type": "git",
"git": {
"commitHash": "6a20ba82b439ea1fd650da4d389e96b60a1dd828",
"repositoryUrl": "https://github.com/onnx/onnx.git"
},
"comments": "onnx"
}
},
{
"component": {
"type": "git",
Expand Down
18 changes: 0 additions & 18 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1282,14 +1282,6 @@ if (onnxruntime_USE_OPENVINO)
add_definitions(-DOPENVINO_CONFIG_CPU_FP16=1)
endif()

if (onnxruntime_USE_OPENVINO_VPUX_FP16)
add_definitions(-DOPENVINO_CONFIG_VPUX_FP16=1)
endif()

if (onnxruntime_USE_OPENVINO_VPUX_U8)
add_definitions(-DOPENVINO_CONFIG_VPUX_U8=1)
endif()

if (onnxruntime_USE_OPENVINO_GPU_FP32_NP)
add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
Expand All @@ -1310,16 +1302,6 @@ if (onnxruntime_USE_OPENVINO)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
endif()

if (onnxruntime_USE_OPENVINO_VPUX_FP32_NP)
add_definitions(-DOPENVINO_CONFIG_VPUX_FP32=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
endif()

if (onnxruntime_USE_OPENVINO_VPUX_FP16_NP)
add_definitions(-DOPENVINO_CONFIG_VPUX_FP16=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
endif()

if (onnxruntime_USE_OPENVINO_HETERO)
add_definitions(-DOPENVINO_CONFIG_HETERO=1)
add_definitions(-DDEVICE_NAME="${onnxruntime_USE_OPENVINO_DEVICE}")
Expand Down
2 changes: 1 addition & 1 deletion cmake/deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36
microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
onnx;https://github.com/onnx/onnx/archive/6a20ba82b439ea1fd650da4d389e96b60a1dd828.zip;179a22ad4cd67109c60031ae4b6cf2f434d8bd7e
onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.15.0.zip;54c3f960a0541c5d8d3e60c2933e11f5d3688a11
#use the commit of supporting all the plugins and TRT 8.6-GA (https://github.com/onnx/onnx-tensorrt/commit/0462dc31ae78f48744b6141ae376df1f96d3f459)
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/a43ce67187bab219520fd80f21af8bbd4354bc8c.zip;572535aefef477050f86744dfab1fef840198035
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
Expand Down
2 changes: 1 addition & 1 deletion cmake/external/onnx
1 change: 1 addition & 0 deletions cmake/onnxruntime_providers_cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_slice.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_reshape.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_expand.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_reduce.cc"
)
endif()
# add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
Expand Down
1 change: 1 addition & 0 deletions cmake/onnxruntime_rocm_hipify.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ if (NOT onnxruntime_USE_NCCL)
list(APPEND contrib_ops_excluded_files "collective/distributed_slice.cc")
list(APPEND contrib_ops_excluded_files "collective/distributed_reshape.cc")
list(APPEND contrib_ops_excluded_files "collective/distributed_expand.cc")
list(APPEND contrib_ops_excluded_files "collective/distributed_reduce.cc")
endif()

set(provider_excluded_files
Expand Down
70 changes: 69 additions & 1 deletion docs/ContribOperators.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ Do not modify directly.*
* <a href="#com.microsoft.RotaryEmbedding">com.microsoft.RotaryEmbedding</a>
* <a href="#com.microsoft.SampleOp">com.microsoft.SampleOp</a>
* <a href="#com.microsoft.Sampling">com.microsoft.Sampling</a>
* <a href="#com.microsoft.SkipGroupNorm">com.microsoft.SkipGroupNorm</a>
* <a href="#com.microsoft.SkipLayerNormalization">com.microsoft.SkipLayerNormalization</a>
* <a href="#com.microsoft.SkipSimplifiedLayerNormalization">com.microsoft.SkipSimplifiedLayerNormalization</a>
* <a href="#com.microsoft.Snpe">com.microsoft.Snpe</a>
Expand Down Expand Up @@ -2342,7 +2343,7 @@ This version of the operator has been available since version 1 of the 'com.micr

<dl>
<dt><tt>activation</tt> : int (required)</dt>
<dd>Activation after group normalization: 0 for None, 1 for Swish</dd>
<dd>Activation after group normalization: 0 for None, 1 for SiLU</dd>
<dt><tt>channels_last</tt> : int</dt>
<dd>1 if the input and output are in the NHWC layout, 0 if it is in the NCHW layout. Defaults to 1.</dd>
<dt><tt>epsilon</tt> : float</dt>
Expand Down Expand Up @@ -2582,6 +2583,7 @@ This version of the operator has been available since version 1 of the 'com.micr
Input B is stored as uint8_t with shape: [(N * K + 1) / 2].
Input absmax is stored in same type as original type of B(float32, float16) with shape like: [(N * K + block_size - 1) / block_size].


#### Version

This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
Expand Down Expand Up @@ -5083,6 +5085,72 @@ This version of the operator has been available since version 1 of the 'com.micr
</dl>


### <a name="com.microsoft.SkipGroupNorm"></a><a name="com.microsoft.skipgroupnorm">**com.microsoft.SkipGroupNorm**</a>

This operator element-wise adds x, skip and bias, then applies group normalization and an optional activation.

This operator transforms input according to
s = x + skip + bias
y = gamma * (s - mean) / sqrt(variance + epsilon) + beta

The input channels are separated into num_groups groups, each containing num_channels / num_groups channels.
The num_channels must be divisible by num_groups.
The mean and standard-deviation of s are calculated separately over each group.
The gamma and beta are per-channel affine transform parameter vectors of size num_channels.

The activation attribute can be used to enable activation after group normalization.

#### Version

This version of the operator has been available since version 1 of the 'com.microsoft' operator set.

#### Attributes

<dl>
<dt><tt>activation</tt> : int (required)</dt>
<dd>Activation after group normalization: 0 for None, 1 for SiLU</dd>
<dt><tt>channels_last</tt> : int</dt>
<dd>1 if the input and output are in the NHWC layout, 0 if it is in the NCHW layout. Defaults to 1.</dd>
<dt><tt>epsilon</tt> : float</dt>
<dd>The epsilon value to use to avoid division by zero</dd>
<dt><tt>groups</tt> : int (required)</dt>
<dd>The number of groups of channels. It should be a divisor of the number of channels C</dd>
</dl>

#### Inputs (4 - 5)

<dl>
<dt><tt>X</tt> : T</dt>
<dd>Input data tensor. Dimensions are (N x H x W x C) when channels_last is 1 or (N x C x H x W) otherwise, where N is the batch size, C is the number of channels, and H and W are the height and width of the data</dd>
<dt><tt>gamma</tt> : M</dt>
<dd>1D gamma tensor for normalization with shape (C), where C is number of channels</dd>
<dt><tt>beta</tt> : M</dt>
<dd>1D beta tensor for normalization with shape (C), where C is number of channels</dd>
<dt><tt>skip</tt> : T</dt>
<dd>4D or 2D skip tensor. The shape can be (N x H x W x C) or (N x 1 x 1 x C) or (N x C)</dd>
<dt><tt>bias</tt> (optional) : T</dt>
<dd>1D bias tensor. Dimensions are (C), where C is number of channels</dd>
</dl>

#### Outputs (1 - 2)

<dl>
<dt><tt>Y</tt> : T</dt>
<dd>The output tensor of the same shape as X</dd>
<dt><tt>S</tt> (optional) : T</dt>
<dd>The element-wise sum of input x, skip and bias tensors. It has the same shape as X</dd>
</dl>

#### Type Constraints

<dl>
<dt><tt>T</tt> : tensor(float16), tensor(float)</dt>
<dd>Constrain input X, skip, bias and output Y, S types to float tensors.</dd>
<dt><tt>M</tt> : tensor(float16), tensor(float)</dt>
<dd>Constrain gamma and beta to float tensors.</dd>
</dl>


### <a name="com.microsoft.SkipLayerNormalization"></a><a name="com.microsoft.skiplayernormalization">**com.microsoft.SkipLayerNormalization**</a>

Skip and Layer Normalization Fusion
Expand Down
1 change: 1 addition & 0 deletions docs/OperatorKernels.md
Original file line number Diff line number Diff line change
Expand Up @@ -861,6 +861,7 @@ Do not modify directly.*
|Rfft|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
|RotaryEmbedding|*in* input:**T**<br> *in* position_ids:**M**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**|1+|**M** = tensor(int64)<br/> **T** = tensor(float), tensor(float16)|
|Sampling|*in* input_ids:**I**<br> *in* max_length:**I**<br> *in* min_length:**I**<br> *in* repetition_penalty:**T**<br> *in* vocab_mask:**I**<br> *in* prefix_vocab_mask:**I**<br> *in* attention_mask:**I**<br> *in* presence_mask:**I**<br> *in* seed:**I**<br> *out* sequences:**I**<br> *out* filtered_logits:**T**|1+|**T** = tensor(float), tensor(float16)|
|SkipGroupNorm|*in* X:**T**<br> *in* gamma:**M**<br> *in* beta:**M**<br> *in* skip:**T**<br> *in* bias:**T**<br> *out* Y:**T**<br> *out* S:**T**|1+|**T** = tensor(float), tensor(float16)|
|SkipLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
|SkipSimplifiedLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
|TransposeMatMul|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
Expand Down
2 changes: 0 additions & 2 deletions docs/python/ReadMeOV.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ OpenVINO™ Execution Provider for ONNX Runtime accelerates inference across man
- Intel® CPUs
- Intel® integrated GPUs
- Intel® discrete GPUs
- Intel® integrated VPUs

Installation
------------
Expand All @@ -22,7 +21,6 @@ This package supports:
- Intel® CPUs
- Intel® integrated GPUs
- Intel® discrete GPUs
- Intel® integrated VPUs

``pip3 install onnxruntime-openvino``

Expand Down
8 changes: 6 additions & 2 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ typedef struct OrtMIGraphXProviderOptions {
typedef struct OrtOpenVINOProviderOptions {
#ifdef __cplusplus
OrtOpenVINOProviderOptions() : device_type{},
enable_vpu_fast_compile{},
enable_npu_fast_compile{},
device_id{},
num_of_threads{},
cache_dir{},
Expand All @@ -624,7 +624,7 @@ typedef struct OrtOpenVINOProviderOptions {
* Valid settings are one of: "CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16"
*/
const char* device_type;
unsigned char enable_vpu_fast_compile; ///< 0 = disabled, nonzero = enabled
unsigned char enable_npu_fast_compile; ///< 0 = disabled, nonzero = enabled
const char* device_id;
size_t num_of_threads; ///< 0 = Use default number of threads
const char* cache_dir; // path is set to empty by default
Expand Down Expand Up @@ -4605,6 +4605,10 @@ struct OrtCustomOp {
OrtStatusPtr(ORT_API_CALL* KernelComputeV2)(_In_ void* op_kernel, _In_ OrtKernelContext* context);

OrtStatusPtr(ORT_API_CALL* InferOutputShapeFn)(_In_ const struct OrtCustomOp* op, _In_ OrtShapeInferContext*);

// Get the start and end versions of the custom op
int(ORT_API_CALL* GetStartVersion)(_In_ const struct OrtCustomOp* op);
int(ORT_API_CALL* GetEndVersion)(_In_ const struct OrtCustomOp* op);
};

/*
Expand Down
13 changes: 13 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_cxx_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -2228,6 +2228,8 @@ struct ShapeInferContext {

using ShapeInferFn = Ort::Status (*)(Ort::ShapeInferContext&);

// Default end version assigned to custom ops (2^31 - 1).
// The replacement list is fully parenthesized so the macro expands to a single
// value when used inside a larger expression (e.g. `2 * MAX_CUSTOM_OP_END_VER`).
#define MAX_CUSTOM_OP_END_VER ((1UL << 31) - 1)

template <typename TOp, typename TKernel, bool WithStatus = false>
struct CustomOpBase : OrtCustomOp {
CustomOpBase() {
Expand Down Expand Up @@ -2280,6 +2282,14 @@ struct CustomOpBase : OrtCustomOp {
}

SetShapeInferFn<TOp>(0);

OrtCustomOp::GetStartVersion = [](const OrtCustomOp* this_) {
return static_cast<const TOp*>(this_)->start_ver_;
};

OrtCustomOp::GetEndVersion = [](const OrtCustomOp* this_) {
return static_cast<const TOp*>(this_)->end_ver_;
};
}

// Default implementation of GetExecutionProviderType that returns nullptr to default to the CPU provider
Expand Down Expand Up @@ -2348,6 +2358,9 @@ struct CustomOpBase : OrtCustomOp {
protected:
// Helper function that returns a map of session config entries specified by CustomOpBase::GetSessionConfigKeys.
void GetSessionConfigs(std::unordered_map<std::string, std::string>& out, ConstSessionOptions options) const;

int start_ver_ = 1;
int end_ver_ = MAX_CUSTOM_OP_END_VER;
};

} // namespace Ort
Expand Down
59 changes: 43 additions & 16 deletions include/onnxruntime/core/session/onnxruntime_lite_custom_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -773,8 +773,11 @@ struct OrtLiteCustomOp : public OrtCustomOp {
PARSE_ARGS(Ort::Float8E5M2FNUZ_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ)

OrtLiteCustomOp(const char* op_name,
const char* execution_provider) : op_name_(op_name),
execution_provider_(execution_provider) {
const char* execution_provider,
int start_ver = 1, int end_ver = MAX_CUSTOM_OP_END_VER) : op_name_(op_name),
execution_provider_(execution_provider),
start_ver_(start_ver),
end_ver_(end_ver) {
OrtCustomOp::version = ORT_API_VERSION;

OrtCustomOp::GetName = [](const OrtCustomOp* op) { return static_cast<const OrtLiteCustomOp*>(op)->op_name_.c_str(); };
Expand Down Expand Up @@ -837,13 +840,26 @@ struct OrtLiteCustomOp : public OrtCustomOp {
OrtCustomOp::KernelCompute = {};

OrtCustomOp::InferOutputShapeFn = {};

OrtCustomOp::GetStartVersion = [](const OrtCustomOp* op) {
auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
return self->start_ver_;
};

OrtCustomOp::GetEndVersion = [](const OrtCustomOp* op) {
auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
return self->end_ver_;
};
}

const std::string op_name_;
const std::string execution_provider_;

std::vector<ONNXTensorElementDataType> input_types_;
std::vector<ONNXTensorElementDataType> output_types_;

int start_ver_ = 1;
int end_ver_ = MAX_CUSTOM_OP_END_VER;
};

//////////////////////////// OrtLiteCustomFunc ////////////////////////////////
Expand Down Expand Up @@ -873,9 +889,11 @@ struct OrtLiteCustomFunc : public OrtLiteCustomOp {
OrtLiteCustomFunc(const char* op_name,
const char* execution_provider,
ComputeFn compute_fn,
ShapeInferFn shape_infer_fn = {}) : OrtLiteCustomOp(op_name, execution_provider),
compute_fn_(compute_fn),
shape_infer_fn_(shape_infer_fn) {
ShapeInferFn shape_infer_fn = {},
int start_ver = 1,
int end_ver = MAX_CUSTOM_OP_END_VER) : OrtLiteCustomOp(op_name, execution_provider, start_ver, end_ver),
compute_fn_(compute_fn),
shape_infer_fn_(shape_infer_fn) {
ParseArgs<Args...>(input_types_, output_types_);

OrtCustomOp::KernelCompute = [](void* op_kernel, OrtKernelContext* context) {
Expand Down Expand Up @@ -911,9 +929,11 @@ struct OrtLiteCustomFunc : public OrtLiteCustomOp {
OrtLiteCustomFunc(const char* op_name,
const char* execution_provider,
ComputeFnReturnStatus compute_fn_return_status,
ShapeInferFn shape_infer_fn = {}) : OrtLiteCustomOp(op_name, execution_provider),
compute_fn_return_status_(compute_fn_return_status),
shape_infer_fn_(shape_infer_fn) {
ShapeInferFn shape_infer_fn = {},
int start_ver = 1,
int end_ver = MAX_CUSTOM_OP_END_VER) : OrtLiteCustomOp(op_name, execution_provider, start_ver, end_ver),
compute_fn_return_status_(compute_fn_return_status),
shape_infer_fn_(shape_infer_fn) {
ParseArgs<Args...>(input_types_, output_types_);

OrtCustomOp::KernelComputeV2 = [](void* op_kernel, OrtKernelContext* context) -> OrtStatusPtr {
Expand Down Expand Up @@ -985,8 +1005,9 @@ struct OrtLiteCustomStruct : public OrtLiteCustomOp {
};

OrtLiteCustomStruct(const char* op_name,
const char* execution_provider) : OrtLiteCustomOp(op_name,
execution_provider) {
const char* execution_provider,
int start_ver = 1,
int end_ver = MAX_CUSTOM_OP_END_VER) : OrtLiteCustomOp(op_name, execution_provider, start_ver, end_ver) {
SetCompute(&CustomOp::Compute);

OrtCustomOp::CreateKernel = [](const OrtCustomOp* this_, const OrtApi* ort_api, const OrtKernelInfo* info) {
Expand Down Expand Up @@ -1049,25 +1070,31 @@ template <typename... Args>
OrtLiteCustomOp* CreateLiteCustomOp(const char* op_name,
const char* execution_provider,
void (*custom_compute_fn)(Args...),
Status (*shape_infer_fn)(ShapeInferContext&) = {}) {
Status (*shape_infer_fn)(ShapeInferContext&) = {},
int start_ver = 1,
int end_ver = MAX_CUSTOM_OP_END_VER) {
using LiteOp = OrtLiteCustomFunc<Args...>;
return std::make_unique<LiteOp>(op_name, execution_provider, custom_compute_fn, shape_infer_fn).release();
return std::make_unique<LiteOp>(op_name, execution_provider, custom_compute_fn, shape_infer_fn, start_ver, end_ver).release();
}

template <typename... Args>
OrtLiteCustomOp* CreateLiteCustomOp(const char* op_name,
const char* execution_provider,
Status (*custom_compute_fn_v2)(Args...),
Status (*shape_infer_fn)(ShapeInferContext&) = {}) {
Status (*shape_infer_fn)(ShapeInferContext&) = {},
int start_ver = 1,
int end_ver = MAX_CUSTOM_OP_END_VER) {
using LiteOp = OrtLiteCustomFunc<Args...>;
return std::make_unique<LiteOp>(op_name, execution_provider, custom_compute_fn_v2, shape_infer_fn).release();
return std::make_unique<LiteOp>(op_name, execution_provider, custom_compute_fn_v2, shape_infer_fn, start_ver, end_ver).release();
}

template <typename CustomOp>
OrtLiteCustomOp* CreateLiteCustomOp(const char* op_name,
const char* execution_provider) {
const char* execution_provider,
int start_ver = 1,
int end_ver = MAX_CUSTOM_OP_END_VER) {
using LiteOp = OrtLiteCustomStruct<CustomOp>;
return std::make_unique<LiteOp>(op_name, execution_provider).release();
return std::make_unique<LiteOp>(op_name, execution_provider, start_ver, end_ver).release();
}

} // namespace Custom
Expand Down
Loading

0 comments on commit 76e7c79

Please sign in to comment.