Enable FP16 Clip and Handle Bias in FP16 Depthwise Conv #21493

Merged · 8 commits · Jul 30, 2024
4 changes: 2 additions & 2 deletions docs/OperatorKernels.md
@@ -58,8 +58,8 @@ Do not modify directly.*
|Ceil|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
|Celu|*in* X:**T**<br> *out* Y:**T**|12+|**T** = tensor(float)|
|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||11|**T** = tensor(float)|
|||[6, 10]|**T** = tensor(float)|
|Col2Im|*in* input:**T**<br> *in* image_shape:**tensor(int64)**<br> *in* block_shape:**tensor(int64)**<br> *out* output:**T**|18+|**T** = tensor(float)|
2 changes: 2 additions & 0 deletions onnxruntime/core/mlas/inc/mlas.h
@@ -1751,6 +1751,7 @@ MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* Pac
* @brief Indirect Depthwise convolution for fp16
* @param Input Supplies the indirect buffer for NHWC input
* @param Filter Supplies the address for filter tensor
* @param Bias Supplies the address for the 1D bias tensor B, which has size M
* @param Output Supplies the address for the result tensor
* @param Channels # of input channels
* @param OutputCount # of output pixels
@@ -1762,6 +1763,7 @@
MLASCALL
MlasConvDepthwise(
const MLAS_FP16* const* Input,
const MLAS_FP16* Filter,
const MLAS_FP16* Bias,
MLAS_FP16* Output,
size_t Channels,
size_t OutputCount,
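The hunk above adds an optional Bias argument to the MlasConvDepthwise entry point. As a reading aid, here is a minimal scalar sketch of the computation the new signature describes, using plain float in place of MLAS_FP16. The indirection-buffer layout (KernelSize pointers per output pixel) and the [KernelSize][Channels] filter layout are assumptions inferred from the kernel code further down, not part of the public header; this is an illustration, not code from the PR.

#include <cstddef>

// Scalar reference of depthwise convolution with the new optional bias.
// Assumed layout: Input holds KernelSize pointers per output pixel, each
// pointing at Channels values; Filter is [KernelSize][Channels]; Bias has
// Channels entries and may be nullptr, which keeps the previous
// zero-initialized behavior.
void ConvDepthwiseRef(const float* const* Input, const float* Filter, const float* Bias,
                      float* Output, size_t Channels, size_t OutputCount, size_t KernelSize)
{
    for (size_t o = 0; o < OutputCount; ++o) {
        for (size_t c = 0; c < Channels; ++c) {
            float acc = (Bias != nullptr) ? Bias[c] : 0.0f;  // bias seeds the accumulator
            for (size_t k = 0; k < KernelSize; ++k) {
                acc += Input[o * KernelSize + k][c] * Filter[k * Channels + c];
            }
            *Output++ = acc;
        }
    }
}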
32 changes: 17 additions & 15 deletions onnxruntime/core/mlas/lib/dwconv.cpp
Expand Up @@ -14,7 +14,6 @@ Module Name:

--*/


#include "fp16_common.h"

#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
@@ -24,19 +23,20 @@
void
MlasConvDepthwiseKernel(
const _mlas_fp16_* const* Input,
const _mlas_fp16_* Filter,
const _mlas_fp16_* Bias,
_mlas_fp16_* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
)
{
while (OutputCount > 0) {
size_t ChannelOffset = 0;
size_t c = Channels;

while (c >= 8) {
MLAS_FLOAT16X8 Accumulator = MlasZeroFloat16x8();
MLAS_FLOAT16X8 Accumulator = Bias == nullptr ? MlasZeroFloat16x8() : MlasLoadFloat16x8(&Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -54,7 +54,7 @@ MlasConvDepthwiseKernel(
}

if (c >= 4) {
MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -72,7 +72,8 @@ MlasConvDepthwiseKernel(
}

if (c > 0) {
MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
MLAS_FLOAT16X4 Accumulator =
Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -86,8 +87,7 @@ MlasConvDepthwiseKernel(
Output += c;
}
if (PostProc) {
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
Channels);
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels, Channels);
}
Input += KernelSize;
OutputCount -= 1;
@@ -101,16 +101,17 @@
void
MlasConvDepthwiseKernel(
const _mlas_fp16_* const* Input,
const _mlas_fp16_* Filter,
const _mlas_fp16_* Bias,
_mlas_fp16_* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
)
{
while (OutputCount > 0) {
for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) {
float Accumulator = 0.0f;
float Accumulator = Bias == nullptr ? 0.0f : MLAS_Half2Float(Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -120,35 +121,36 @@ MlasConvDepthwiseKernel(
*Output++ = MLAS_Float2Half(Accumulator);
}
if (PostProc) {
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
Channels);
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels, Channels);
}
Input += KernelSize;
OutputCount -= 1;
}
}

#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED

#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED

void
MLASCALL
MlasConvDepthwise(
const MLAS_FP16* const* Input,
const MLAS_FP16* Filter,
const MLAS_FP16* Bias,
MLAS_FP16* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
)
{
MlasConvDepthwiseKernel(
reinterpret_cast<const _mlas_fp16_* const*>(Input),
reinterpret_cast<const _mlas_fp16_*>(Filter),
reinterpret_cast<const _mlas_fp16_*>(Bias),
reinterpret_cast<_mlas_fp16_*>(Output),
Channels,
OutputCount,
KernelSize,
PostProc);
PostProc
);
}
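A note on the shape of the change in this file: rather than adding the bias after the kernel-size reduction, each accumulator is seeded with the bias (or with zeros when Bias is nullptr), so no extra pass over the output is needed. In exact arithmetic the two formulations below are equivalent; this is only an illustrative sketch with hypothetical names, not code from the PR.

#include <cstddef>

// Two equivalent ways (in exact arithmetic) to apply a per-channel bias to a
// reduction; the kernels above use the seeded-accumulator form.
float DotBiasSeeded(const float* x, const float* w, size_t n, float bias)
{
    float acc = bias;                     // accumulator starts at the bias
    for (size_t i = 0; i < n; ++i) acc += x[i] * w[i];
    return acc;
}

float DotBiasAfter(const float* x, const float* w, size_t n, float bias)
{
    float acc = 0.0f;                     // zero start, bias added at the end
    for (size_t i = 0; i < n; ++i) acc += x[i] * w[i];
    return acc + bias;                    // costs an extra add per output element
}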
17 changes: 17 additions & 0 deletions onnxruntime/core/mlas/lib/fp16_common.h
@@ -64,6 +64,23 @@
MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); }

MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len)
{
MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4();
if ((len & 1) != 0) {
Vector = vreinterpret_f16_u16(vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0));
}
if ((len & 2) != 0) {
Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0));
Vector = vreinterpret_f16_f32(
vld1_lane_f32(reinterpret_cast<const float*>(Buffer), vreinterpret_f32_f16(Vector), 0)
);
}
return Vector;
}

MLAS_FORCEINLINE
void
MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector)
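For readers not fluent in NEON, here is a scalar model of what the lane shuffles in the new MlasLoadPartialFloat16x4 accomplish for len in 1..3. It is an illustration under the assumption that the intrinsics behave as documented, not code from the PR; the point of the dance is to never read past Buffer[len - 1].

#include <cstddef>
#include <cstdint>

// Scalar model of MlasLoadPartialFloat16x4: load len (1..3) fp16 values into a
// four-lane vector image without reading beyond Buffer[len - 1]. Lanes are kept
// as raw uint16_t since only the bit pattern matters for the load itself.
void LoadPartialFloat16x4Model(const uint16_t* Buffer, size_t len, uint16_t lanes[4])
{
    lanes[0] = lanes[1] = lanes[2] = lanes[3] = 0;  // MlasZeroFloat16x4()
    if ((len & 1) != 0) {
        lanes[0] = Buffer[len - 1];                 // vld1_lane_u16: odd tail element into lane 0
    }
    if ((len & 2) != 0) {
        lanes[2] = lanes[0];                        // vdup_lane_f32(..., 0): 32-bit lane 0
        lanes[3] = lanes[1];                        //   (u16 lanes 0..1) copied into lanes 2..3
        lanes[0] = Buffer[0];                       // vld1_lane_f32: Buffer[0..1] into lanes 0..1
        lanes[1] = Buffer[1];
    }
}
// len == 1 -> {B0, 0, 0, 0}; len == 2 -> {B0, B1, 0, 0}; len == 3 -> {B0, B1, B2, 0}.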
4 changes: 3 additions & 1 deletion onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
@@ -139,8 +139,9 @@ Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr

bool share_prepacked_weights = (prepacked_weights != nullptr);

const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1);
// Don't pack the filter buffer if the MlasConvDepthwise path is used.
if (!(group_input_channels == 1 && group_output_channels == 1)) {
if (!is_depthwise_conv) {
packed_W_size_ = MlasHalfGemmPackBSize(group_output_channels, kernel_dim, false);
if (packed_W_size_ != 0) {
size_t packed_W_data_size = SafeInt<size_t>(group_count) * packed_W_size_;
@@ -472,6 +473,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
MlasConvDepthwise(
worker_indirection_buffer,
reordered_W,
Bdata,
worker_output,
static_cast<size_t>(M),
static_cast<size_t>(output_count),
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/cpu/math/clip.cc
@@ -23,7 +23,7 @@ ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES(
float);
ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES(
kCpuExecutionProvider, kOnnxDomain, Clip, 12, Input, 0,
float, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t);
float, MLFloat16, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t);
} // namespace op_kernel_type_control

using EnabledClip11Types = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(
18 changes: 18 additions & 0 deletions onnxruntime/test/providers/cpu/math/clip_test.cc
@@ -119,6 +119,24 @@ TEST(MathOpTest, Clip_Default_uint64) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
}

TEST(MathOpTest, Clip_MLFloat16) {
OpTester test("Clip", 12);

std::vector<int64_t> dims{3, 3};
test.AddInput<MLFloat16>("X", dims,
{MLFloat16(-1.0f), MLFloat16(-2.0f), MLFloat16(-3.0f),
MLFloat16(-4.0f), MLFloat16(0.0f), MLFloat16(2.0f),
MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(8.0f)});
test.AddInput<MLFloat16>("min", {}, {MLFloat16(0.0f)});
test.AddInput<MLFloat16>("max", {}, {MLFloat16(6.0f)});
test.AddOutput<MLFloat16>("Y", dims,
{MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(0.0f),
MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(2.0f),
MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(6.0f)});

test.Run();
}

TEST(MathOpTest, Clip_int32) {
OpTester test("Clip", 12);
