From f69e4bae919913be9cc0aff6265872fc346d94f8 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Wed, 24 Jul 2024 22:13:27 +0000 Subject: [PATCH 1/8] Enable FP16 Clip and Fix Bias Issue in FP16 Depthwise Conv --- onnxruntime/core/mlas/inc/mlas.h | 1 + onnxruntime/core/mlas/lib/dwconv.cpp | 12 +- .../core/providers/cpu/fp16/fp16_conv.cc | 4 +- onnxruntime/core/providers/cpu/math/clip.cc | 2 +- .../test/providers/cpu/nn/conv_fp16_test.cc | 178 ++++++++++++++++++ .../test/providers/cpu/nn/conv_op_test.cc | 158 ++++++++++++++++ 6 files changed, 349 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 675f7c7a13e8c..2e6ef0287cf11 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -1762,6 +1762,7 @@ MLASCALL MlasConvDepthwise( const MLAS_FP16* const* Input, const MLAS_FP16* Filter, + const MLAS_FP16* Bias, MLAS_FP16* Output, size_t Channels, size_t OutputCount, diff --git a/onnxruntime/core/mlas/lib/dwconv.cpp b/onnxruntime/core/mlas/lib/dwconv.cpp index 15511d2d8ceac..19951a629f847 100644 --- a/onnxruntime/core/mlas/lib/dwconv.cpp +++ b/onnxruntime/core/mlas/lib/dwconv.cpp @@ -24,6 +24,7 @@ void MlasConvDepthwiseKernel( const _mlas_fp16_* const* Input, const _mlas_fp16_* Filter, + const _mlas_fp16_* Bias, _mlas_fp16_* Output, size_t Channels, size_t OutputCount, @@ -36,7 +37,7 @@ MlasConvDepthwiseKernel( size_t c = Channels; while (c >= 8) { - MLAS_FLOAT16X8 Accumulator = MlasZeroFloat16x8(); + MLAS_FLOAT16X8 Accumulator = Bias == nullptr ? MlasZeroFloat16x8() : MlasLoadFloat16x8(&Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -54,7 +55,7 @@ MlasConvDepthwiseKernel( } if (c >= 4) { - MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4(); + MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -72,7 +73,7 @@ MlasConvDepthwiseKernel( } if (c > 0) { - MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4(); + MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -101,6 +102,7 @@ void MlasConvDepthwiseKernel( const _mlas_fp16_* const* Input, const _mlas_fp16_* Filter, + const _mlas_fp16_* Bias, _mlas_fp16_* Output, size_t Channels, size_t OutputCount, @@ -110,7 +112,7 @@ MlasConvDepthwiseKernel( { while (OutputCount > 0) { for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) { - float Accumulator = 0.0f; + float Accumulator = Bias == nullptr ? 0.0f : MLAS_Half2Float(Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -136,6 +138,7 @@ MLASCALL MlasConvDepthwise( const MLAS_FP16* const* Input, const MLAS_FP16* Filter, + const MLAS_FP16* Bias, MLAS_FP16* Output, size_t Channels, size_t OutputCount, @@ -146,6 +149,7 @@ MlasConvDepthwise( MlasConvDepthwiseKernel( reinterpret_cast(Input), reinterpret_cast(Filter), + reinterpret_cast(Bias), reinterpret_cast<_mlas_fp16_*>(Output), Channels, OutputCount, diff --git a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc index e6867f10819ae..37db095e92570 100644 --- a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc +++ b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc @@ -139,8 +139,9 @@ Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr bool share_prepacked_weights = (prepacked_weights != nullptr); + const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1); // Don't pack the filter buffer if the MlasConvDepthwise path is used. - if (!(group_input_channels == 1 && group_output_channels == 1)) { + if (!is_depthwise_conv) { packed_W_size_ = MlasHalfGemmPackBSize(group_output_channels, kernel_dim, false); if (packed_W_size_ != 0) { size_t packed_W_data_size = SafeInt(group_count) * packed_W_size_; @@ -472,6 +473,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const { MlasConvDepthwise( worker_indirection_buffer, reordered_W, + Bdata, worker_output, static_cast(M), static_cast(output_count), diff --git a/onnxruntime/core/providers/cpu/math/clip.cc b/onnxruntime/core/providers/cpu/math/clip.cc index ddb64a5a0e461..b3a5cb209afc1 100644 --- a/onnxruntime/core/providers/cpu/math/clip.cc +++ b/onnxruntime/core/providers/cpu/math/clip.cc @@ -23,7 +23,7 @@ ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES( float); ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES( kCpuExecutionProvider, kOnnxDomain, Clip, 12, Input, 0, - float, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t); + float, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t, MLFloat16); } // namespace op_kernel_type_control using EnabledClip11Types = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST( diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc index cb5fc8095982c..5a2660c05fe35 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc @@ -714,6 +714,184 @@ TEST(ConvFp16Test, Conv2D_group) { TestConvFp16Op(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +TEST(ConvFp16Test, Depthwise2D_Bias) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 2, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + MLFloat16(0.0f), MLFloat16(1.0f), MLFloat16(2.0f), + MLFloat16(3.0f), MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), MLFloat16(8.0f), + + MLFloat16(9.0f), MLFloat16(10.0f), MLFloat16(11.0f), + MLFloat16(12.0f), MLFloat16(13.0f), MLFloat16(14.0f), + MLFloat16(15.0f), MLFloat16(16.0f), MLFloat16(17.0f) + }; + vector X_shape = {1, 2, 3, 3}; + vector W = {MLFloat16(1.0f), MLFloat16(2.0f)}; + vector W_shape = {2, 1, 1, 1}; + vector B = {MLFloat16(1.0f), MLFloat16(-1.0f)}; + vector B_shape = {2}; + vector Y_shape = {1, 2, 3, 3}; + auto expected_vals = { + MLFloat16(1.0f), MLFloat16(2.0f), MLFloat16(3.0f), + MLFloat16(4.0f), MLFloat16(5.0f), MLFloat16(6.0f), + MLFloat16(7.0f), MLFloat16(8.0f), MLFloat16(9.0f), + + MLFloat16(17.0f), MLFloat16(19.0f), MLFloat16(21.0f), + MLFloat16(23.0f), MLFloat16(25.0f), MLFloat16(27.0f), + MLFloat16(29.0f), MLFloat16(31.0f), MLFloat16(33.0f) + }; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 13, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + // C = 0 + MLFloat16(0.0f), MLFloat16(1.0f), + MLFloat16(2.0f), MLFloat16(3.0f), + + // C = 1 + MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), + + // C = 2 + MLFloat16(8.0f), MLFloat16(9.0f), + MLFloat16(10.0f), MLFloat16(11.0f), + + // C = 3 + MLFloat16(12.0f), MLFloat16(13.0f), + MLFloat16(14.0f), MLFloat16(15.0f), + + // C = 4 + MLFloat16(16.0f), MLFloat16(17.0f), + MLFloat16(18.0f), MLFloat16(19.0f), + + // C = 5 + MLFloat16(20.0f), MLFloat16(21.0f), + MLFloat16(22.0f), MLFloat16(23.0f), + + // C = 6 + MLFloat16(24.0f), MLFloat16(25.0f), + MLFloat16(26.0f), MLFloat16(27.0f), + + // C = 7 + MLFloat16(28.0f), MLFloat16(29.0f), + MLFloat16(30.0f), MLFloat16(31.0f), + + // C = 8 + MLFloat16(32.0f), MLFloat16(33.0f), + MLFloat16(34.0f), MLFloat16(35.0f), + + // C = 9 + MLFloat16(36.0f), MLFloat16(37.0f), + MLFloat16(38.0f), MLFloat16(39.0f), + + // C = 10 + MLFloat16(40.0f), MLFloat16(41.0f), + MLFloat16(42.0f), MLFloat16(43.0f), + + // C = 11 + MLFloat16(44.0f), MLFloat16(45.0f), + MLFloat16(46.0f), MLFloat16(47.0f), + + // C = 12 + MLFloat16(48.0f), MLFloat16(49.0f), + MLFloat16(50.0f), MLFloat16(51.0f), + }; + vector X_shape = {1, 13, 2, 2}; + vector W = { + // M = 0 + MLFloat16(0.0f), MLFloat16(1.0f), + MLFloat16(2.0f), MLFloat16(3.0f), + + // M = 1 + MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), + + // M = 2 + MLFloat16(8.0f), MLFloat16(9.0f), + MLFloat16(10.0f), MLFloat16(11.0f), + + // M = 3 + MLFloat16(12.0f), MLFloat16(13.0f), + MLFloat16(14.0f), MLFloat16(15.0f), + + // M = 4 + MLFloat16(16.0f), MLFloat16(17.0f), + MLFloat16(18.0f), MLFloat16(19.0f), + + // M = 5 + MLFloat16(20.0f), MLFloat16(21.0f), + MLFloat16(22.0f), MLFloat16(23.0f), + + // M = 6 + MLFloat16(24.0f), MLFloat16(25.0f), + MLFloat16(26.0f), MLFloat16(27.0f), + + // M = 7 + MLFloat16(28.0f), MLFloat16(29.0f), + MLFloat16(30.0f), MLFloat16(31.0f), + + // M = 8 + MLFloat16(32.0f), MLFloat16(33.0f), + MLFloat16(34.0f), MLFloat16(35.0f), + + // M = 9 + MLFloat16(36.0f), MLFloat16(37.0f), + MLFloat16(38.0f), MLFloat16(39.0f), + + // M = 10 + MLFloat16(40.0f), MLFloat16(41.0f), + MLFloat16(42.0f), MLFloat16(43.0f), + + // M = 11 + MLFloat16(44.0f), MLFloat16(45.0f), + MLFloat16(46.0f), MLFloat16(47.0f), + + // M = 12 + MLFloat16(48.0f), MLFloat16(49.0f), + MLFloat16(50.0f), MLFloat16(51.0f), + }; + vector W_shape = {13, 1, 2, 2}; + vector B = { + MLFloat16(1.0f), MLFloat16(2.0f), MLFloat16(3.0f), MLFloat16(4.0f), MLFloat16(5.0f), MLFloat16(6.0f), + MLFloat16(7.0f), MLFloat16(8.0f), MLFloat16(9.0f), MLFloat16(10.0f), MLFloat16(11.0f), MLFloat16(12.0f), + MLFloat16(13.0f), + }; + vector B_shape = {13}; + vector Y_shape = {1, 13, 1, 1}; + auto expected_vals = { + MLFloat16(15.0f), MLFloat16(128.0f), MLFloat16(369.0f), MLFloat16(738.0f), MLFloat16(1235.0f), + MLFloat16(1860.0f), MLFloat16(2613.0f), MLFloat16(3494.0f), MLFloat16(4503.0f), MLFloat16(5640.0f), + MLFloat16(6905.0f), MLFloat16(8298.0f), MLFloat16(9819.0f), + }; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + + // NNAPI/CoreML EP requires weight to be an initializer + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + TEST(ConvFp16Test, ConvDimWithZero) { ConvOpAndTestAttributes attrs = { "", // auto_pad diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 0efa78af2795c..3905019cf57d6 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -647,6 +647,164 @@ TEST(ConvTest, Conv2D_group) { TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +TEST(ConvTest, Depthwise2D_Bias) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 2, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f}; + vector X_shape = {1, 2, 3, 3}; + vector W = {1.0f, 2.0f}; + vector W_shape = {2, 1, 1, 1}; + vector B = {1.0f, -1.0f}; + vector B_shape = {2}; + vector Y_shape = {1, 2, 3, 3}; + auto expected_vals = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 17.0f, 19.0f, 21.0f, 23.0f, 25.0f, 27.0f, 29.0f, 31.0f, 33.0f}; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + + // NNAPI/CoreML EP requires weight to be an initializer + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Depthwise2D_Bias_Complex) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 13, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + // C = 0 + 0.0f, 1.0f, + 2.0f, 3.0f, + + // C = 1 + 4.0f, 5.0f, + 6.0f, 7.0f, + + // C = 2 + 8.0f, 9.0f, + 10.0f, 11.0f, + + // C = 3 + 12.0f, 13.0f, + 14.0f, 15.0f, + + // C = 4 + 16.0f, 17.0f, + 18.0f, 19.0f, + + // C = 5 + 20.0f, 21.0f, + 22.0f, 23.0f, + + // C = 6 + 24.0f, 25.0f, + 26.0f, 27.0f, + + // C = 7 + 28.0f, 29.0f, + 30.0f, 31.0f, + + // C = 8 + 32.0f, 33.0f, + 34.0f, 35.0f, + + // C = 9 + 36.0f, 37.0f, + 38.0f, 39.0f, + + // C = 10 + 40.0f, 41.0f, + 42.0f, 43.0f, + + // C = 11 + 44.0f, 45.0f, + 46.0f, 47.0f, + + // C = 12 + 48.0f, 49.0f, + 50.0f, 51.0f, + }; + vector X_shape = {1, 13, 2, 2}; + vector W = { + // M = 0 + 0.0f, 1.0f, + 2.0f, 3.0f, + + // M = 1 + 4.0f, 5.0f, + 6.0f, 7.0f, + + // M = 2 + 8.0f, 9.0f, + 10.0f, 11.0f, + + // M = 3 + 12.0f, 13.0f, + 14.0f, 15.0f, + + // M = 4 + 16.0f, 17.0f, + 18.0f, 19.0f, + + // M = 5 + 20.0f, 21.0f, + 22.0f, 23.0f, + + // M = 6 + 24.0f, 25.0f, + 26.0f, 27.0f, + + // M = 7 + 28.0f, 29.0f, + 30.0f, 31.0f, + + // M = 8 + 32.0f, 33.0f, + 34.0f, 35.0f, + + // M = 9 + 36.0f, 37.0f, + 38.0f, 39.0f, + + // M = 10 + 40.0f, 41.0f, + 42.0f, 43.0f, + + // M = 11 + 44.0f, 45.0f, + 46.0f, 47.0f, + + // M = 12 + 48.0f, 49.0f, + 50.0f, 51.0f, + }; + vector W_shape = {13, 1, 2, 2}; + vector B = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f}; + vector B_shape = {13}; + vector Y_shape = {1, 13, 1, 1}; + auto expected_vals = { + 15.0f, 128.0f, 369.0f, 738.0f, 1235.0f, 1860.0f, 2613.0f, 3494.0f, 4503.0f, 5640.0f, 6905.0f, 8298.0f, 9819.0f + }; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + + // NNAPI/CoreML EP requires weight to be an initializer + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + TEST(ConvTest, ConvDimWithZero) { ConvOpAndTestAttributes attrs = { "", // auto_pad From a8706d471671f70fdf0baa7e11c311d05acfec01 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Wed, 24 Jul 2024 23:01:25 +0000 Subject: [PATCH 2/8] clang-format the code --- onnxruntime/core/mlas/lib/dwconv.cpp | 19 ++++------ .../test/providers/cpu/nn/conv_fp16_test.cc | 38 ++++++++++++++----- .../test/providers/cpu/nn/conv_op_test.cc | 14 ++++++- 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/onnxruntime/core/mlas/lib/dwconv.cpp b/onnxruntime/core/mlas/lib/dwconv.cpp index 19951a629f847..87effeac5fd9f 100644 --- a/onnxruntime/core/mlas/lib/dwconv.cpp +++ b/onnxruntime/core/mlas/lib/dwconv.cpp @@ -14,7 +14,6 @@ Module Name: --*/ - #include "fp16_common.h" #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -30,7 +29,7 @@ MlasConvDepthwiseKernel( size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { while (OutputCount > 0) { size_t ChannelOffset = 0; @@ -87,8 +86,7 @@ MlasConvDepthwiseKernel( Output += c; } if (PostProc) { - PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, - Channels); + PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, Channels); } Input += KernelSize; OutputCount -= 1; @@ -108,7 +106,7 @@ MlasConvDepthwiseKernel( size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { while (OutputCount > 0) { for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) { @@ -122,16 +120,14 @@ MlasConvDepthwiseKernel( *Output++ = MLAS_Float2Half(Accumulator); } if (PostProc) { - PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, - Channels); + PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, Channels); } Input += KernelSize; OutputCount -= 1; } } -#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED - +#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED void MLASCALL @@ -144,7 +140,7 @@ MlasConvDepthwise( size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { MlasConvDepthwiseKernel( reinterpret_cast(Input), @@ -154,5 +150,6 @@ MlasConvDepthwise( Channels, OutputCount, KernelSize, - PostProc); + PostProc + ); } diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc index 5a2660c05fe35..f723bd10ec581 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc @@ -732,8 +732,7 @@ TEST(ConvFp16Test, Depthwise2D_Bias) { MLFloat16(9.0f), MLFloat16(10.0f), MLFloat16(11.0f), MLFloat16(12.0f), MLFloat16(13.0f), MLFloat16(14.0f), - MLFloat16(15.0f), MLFloat16(16.0f), MLFloat16(17.0f) - }; + MLFloat16(15.0f), MLFloat16(16.0f), MLFloat16(17.0f)}; vector X_shape = {1, 2, 3, 3}; vector W = {MLFloat16(1.0f), MLFloat16(2.0f)}; vector W_shape = {2, 1, 1, 1}; @@ -747,8 +746,7 @@ TEST(ConvFp16Test, Depthwise2D_Bias) { MLFloat16(17.0f), MLFloat16(19.0f), MLFloat16(21.0f), MLFloat16(23.0f), MLFloat16(25.0f), MLFloat16(27.0f), - MLFloat16(29.0f), MLFloat16(31.0f), MLFloat16(33.0f) - }; + MLFloat16(29.0f), MLFloat16(31.0f), MLFloat16(33.0f)}; TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); @@ -874,16 +872,36 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { }; vector W_shape = {13, 1, 2, 2}; vector B = { - MLFloat16(1.0f), MLFloat16(2.0f), MLFloat16(3.0f), MLFloat16(4.0f), MLFloat16(5.0f), MLFloat16(6.0f), - MLFloat16(7.0f), MLFloat16(8.0f), MLFloat16(9.0f), MLFloat16(10.0f), MLFloat16(11.0f), MLFloat16(12.0f), + MLFloat16(1.0f), + MLFloat16(2.0f), + MLFloat16(3.0f), + MLFloat16(4.0f), + MLFloat16(5.0f), + MLFloat16(6.0f), + MLFloat16(7.0f), + MLFloat16(8.0f), + MLFloat16(9.0f), + MLFloat16(10.0f), + MLFloat16(11.0f), + MLFloat16(12.0f), MLFloat16(13.0f), }; vector B_shape = {13}; vector Y_shape = {1, 13, 1, 1}; auto expected_vals = { - MLFloat16(15.0f), MLFloat16(128.0f), MLFloat16(369.0f), MLFloat16(738.0f), MLFloat16(1235.0f), - MLFloat16(1860.0f), MLFloat16(2613.0f), MLFloat16(3494.0f), MLFloat16(4503.0f), MLFloat16(5640.0f), - MLFloat16(6905.0f), MLFloat16(8298.0f), MLFloat16(9819.0f), + MLFloat16(15.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0 + MLFloat16(128.0f), + MLFloat16(369.0f), + MLFloat16(738.0f), + MLFloat16(1235.0f), + MLFloat16(1860.0f), + MLFloat16(2613.0f), // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 7.0 + MLFloat16(3494.0f), + MLFloat16(4503.0f), + MLFloat16(5640.0f), + MLFloat16(6905.0f), + MLFloat16(8298.0f), + MLFloat16(9819.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 }; TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); @@ -1252,4 +1270,4 @@ TEST(ConvFp16Test, SharedPrepackedWeights) { } // namespace test } // namespace onnxruntime -#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED \ No newline at end of file +#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 3905019cf57d6..8cf8c027bdde2 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -796,7 +796,19 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) { vector B_shape = {13}; vector Y_shape = {1, 13, 1, 1}; auto expected_vals = { - 15.0f, 128.0f, 369.0f, 738.0f, 1235.0f, 1860.0f, 2613.0f, 3494.0f, 4503.0f, 5640.0f, 6905.0f, 8298.0f, 9819.0f + 15.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0 + 128.0f, + 369.0f, + 738.0f, + 1235.0f, + 1860.0f, + 2613.0f, // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 7.0 + 3494.0f, + 4503.0f, + 5640.0f, + 6905.0f, + 8298.0f, + 9819.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 }; TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); From 647d2258f4001669c033b3de866c004c3bccaf38 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Thu, 25 Jul 2024 10:05:02 +0000 Subject: [PATCH 3/8] Add FP16/MLFloat16 Clip op test --- onnxruntime/core/providers/cpu/math/clip.cc | 2 +- .../test/providers/cpu/math/clip_test.cc | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cpu/math/clip.cc b/onnxruntime/core/providers/cpu/math/clip.cc index b3a5cb209afc1..200469bc47835 100644 --- a/onnxruntime/core/providers/cpu/math/clip.cc +++ b/onnxruntime/core/providers/cpu/math/clip.cc @@ -23,7 +23,7 @@ ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES( float); ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES( kCpuExecutionProvider, kOnnxDomain, Clip, 12, Input, 0, - float, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t, MLFloat16); + float, MLFloat16, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t); } // namespace op_kernel_type_control using EnabledClip11Types = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST( diff --git a/onnxruntime/test/providers/cpu/math/clip_test.cc b/onnxruntime/test/providers/cpu/math/clip_test.cc index 6f81bbbe31d54..9948a6cc8a681 100644 --- a/onnxruntime/test/providers/cpu/math/clip_test.cc +++ b/onnxruntime/test/providers/cpu/math/clip_test.cc @@ -119,6 +119,24 @@ TEST(MathOpTest, Clip_Default_uint64) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +TEST(MathOpTest, Clip_MLFloat16) { + OpTester test("Clip", 12); + + std::vector dims{3, 3}; + test.AddInput("X", dims, + {MLFloat16(-1.0f), MLFloat16(-2.0f), MLFloat16(-3.0f), + MLFloat16(-4.0f), MLFloat16(0.0f), MLFloat16(2.0f), + MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(8.0f)}); + test.AddInput("min", {}, {MLFloat16(0.0f)}); + test.AddInput("max", {}, {MLFloat16(6.0f)}); + test.AddOutput("Y", dims, + {MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(0.0f), + MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(2.0f), + MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(6.0f)}); + + test.Run(); +} + TEST(MathOpTest, Clip_int32) { OpTester test("Clip", 12); From 26d31a02606e4e9deb3b783e0d55acbea2d6c068 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Sat, 27 Jul 2024 03:58:11 +0000 Subject: [PATCH 4/8] Implement MlasLoadPartialFloat16x4 --- onnxruntime/core/mlas/lib/dwconv.cpp | 3 +- onnxruntime/core/mlas/lib/fp16_common.h | 17 ++++ .../test/providers/cpu/nn/conv_fp16_test.cc | 65 +++++++++++--- .../test/providers/cpu/nn/conv_op_test.cc | 84 +++++++++++++++---- 4 files changed, 138 insertions(+), 31 deletions(-) diff --git a/onnxruntime/core/mlas/lib/dwconv.cpp b/onnxruntime/core/mlas/lib/dwconv.cpp index 87effeac5fd9f..d48d9cbb17502 100644 --- a/onnxruntime/core/mlas/lib/dwconv.cpp +++ b/onnxruntime/core/mlas/lib/dwconv.cpp @@ -72,7 +72,8 @@ MlasConvDepthwiseKernel( } if (c > 0) { - MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]); + MLAS_FLOAT16X4 Accumulator = + Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { diff --git a/onnxruntime/core/mlas/lib/fp16_common.h b/onnxruntime/core/mlas/lib/fp16_common.h index 1fcab870af64f..4ed186fcd0cfb 100644 --- a/onnxruntime/core/mlas/lib/fp16_common.h +++ b/onnxruntime/core/mlas/lib/fp16_common.h @@ -64,6 +64,23 @@ MLAS_FORCEINLINE MLAS_FLOAT16X4 MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); } +MLAS_FORCEINLINE +MLAS_FLOAT16X4 +MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len) +{ + MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4(); + if ((len & 1) != 0) { + Vector = vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0); + } + if ((len & 2) != 0) { + Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0)); + Vector = vreinterpret_f16_f32( + vld1_lane_f32(reinterpret_cast(Buffer), vreinterpret_f32_f16(Vector), 0) + ); + } + return Vector; +} + MLAS_FORCEINLINE void MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector) diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc index f723bd10ec581..b1bbe9b865178 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc @@ -714,7 +714,31 @@ TEST(ConvFp16Test, Conv2D_group) { TestConvFp16Op(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } -TEST(ConvFp16Test, Depthwise2D_Bias) { +TEST(ConvFp16Test, Depthwise2D_Bias_Group1_Issue18992) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {MLFloat16(1.0f)}; + vector X_shape = {1, 1, 1, 1}; + vector W = {MLFloat16(0.5f)}; + vector W_shape = {1, 1, 1, 1}; + vector B = {MLFloat16(0.5f)}; + vector B_shape = {1}; + vector Y_shape = {1, 1, 1, 1}; + auto expected_vals = {MLFloat16(1.0f)}; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvFp16Test, Depthwise2D_Bias_Group2) { ConvOpAndTestAttributes attrs = { "", // auto_pad vector{1, 1}, // dilations @@ -752,11 +776,11 @@ TEST(ConvFp16Test, Depthwise2D_Bias) { TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); } -TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { +TEST(ConvFp16Test, Depthwise2D_Bias_Group15) { ConvOpAndTestAttributes attrs = { "", // auto_pad vector{1, 1}, // dilations - 13, // group + 15, // group vector{2, 2}, // kernel_shape vector{0, 0, 0, 0}, // pads vector{1, 1}, // strides @@ -815,8 +839,15 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { // C = 12 MLFloat16(48.0f), MLFloat16(49.0f), MLFloat16(50.0f), MLFloat16(51.0f), - }; - vector X_shape = {1, 13, 2, 2}; + + // C = 13 + MLFloat16(52.0f), MLFloat16(53.0f), + MLFloat16(54.0f), MLFloat16(55.0f), + + // C = 14 + MLFloat16(56.0f), MLFloat16(57.0f), + MLFloat16(58.0f), MLFloat16(59.0f)}; + vector X_shape = {1, 15, 2, 2}; vector W = { // M = 0 MLFloat16(0.0f), MLFloat16(1.0f), @@ -869,8 +900,15 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { // M = 12 MLFloat16(48.0f), MLFloat16(49.0f), MLFloat16(50.0f), MLFloat16(51.0f), - }; - vector W_shape = {13, 1, 2, 2}; + + // M = 13 + MLFloat16(52.0f), MLFloat16(53.0f), + MLFloat16(54.0f), MLFloat16(55.0f), + + // M = 14 + MLFloat16(56.0f), MLFloat16(57.0f), + MLFloat16(58.0f), MLFloat16(59.0f)}; + vector W_shape = {15, 1, 2, 2}; vector B = { MLFloat16(1.0f), MLFloat16(2.0f), @@ -885,9 +923,10 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { MLFloat16(11.0f), MLFloat16(12.0f), MLFloat16(13.0f), - }; - vector B_shape = {13}; - vector Y_shape = {1, 13, 1, 1}; + MLFloat16(14.0f), + MLFloat16(15.0f)}; + vector B_shape = {15}; + vector Y_shape = {1, 15, 1, 1}; auto expected_vals = { MLFloat16(15.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0 MLFloat16(128.0f), @@ -901,12 +940,12 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { MLFloat16(5640.0f), MLFloat16(6905.0f), MLFloat16(8298.0f), - MLFloat16(9819.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 + MLFloat16(9819.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 + MLFloat16(11468.0f), // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 14.0 + MLFloat16(13245.0f) // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 15.0 }; TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); - - // NNAPI/CoreML EP requires weight to be an initializer TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); } diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 8cf8c027bdde2..418f9bd7794ac 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -647,7 +647,31 @@ TEST(ConvTest, Conv2D_group) { TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } -TEST(ConvTest, Depthwise2D_Bias) { +TEST(ConvTest, Depthwise2D_Bias_Group1_Issue18992) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {1.0f}; + vector X_shape = {1, 1, 1, 1}; + vector W = {0.5f}; + vector W_shape = {1, 1, 1, 1}; + vector B = {0.5f}; + vector B_shape = {1}; + vector Y_shape = {1, 1, 1, 1}; + auto expected_vals = {1.0f}; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Depthwise2D_Bias_Group2) { ConvOpAndTestAttributes attrs = { "", // auto_pad vector{1, 1}, // dilations @@ -658,26 +682,38 @@ TEST(ConvTest, Depthwise2D_Bias) { {} // excluded EPs }; - vector X = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f}; + vector X = { + 0.0f, 1.0f, 2.0f, + 3.0f, 4.0f, 5.0f, + 6.0f, 7.0f, 8.0f, + + 9.0f, 10.0f, 11.0f, + 12.0f, 13.0f, 14.0f, + 15.0f, 16.0f, 17.0f}; vector X_shape = {1, 2, 3, 3}; vector W = {1.0f, 2.0f}; vector W_shape = {2, 1, 1, 1}; vector B = {1.0f, -1.0f}; vector B_shape = {2}; vector Y_shape = {1, 2, 3, 3}; - auto expected_vals = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 17.0f, 19.0f, 21.0f, 23.0f, 25.0f, 27.0f, 29.0f, 31.0f, 33.0f}; + auto expected_vals = { + 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, - TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + 17.0f, 19.0f, 21.0f, + 23.0f, 25.0f, 27.0f, + 29.0f, 31.0f, 33.0f}; - // NNAPI/CoreML EP requires weight to be an initializer + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); } -TEST(ConvTest, Depthwise2D_Bias_Complex) { +TEST(ConvTest, Depthwise2D_Bias_Group15) { ConvOpAndTestAttributes attrs = { "", // auto_pad vector{1, 1}, // dilations - 13, // group + 15, // group vector{2, 2}, // kernel_shape vector{0, 0, 0, 0}, // pads vector{1, 1}, // strides @@ -736,8 +772,15 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) { // C = 12 48.0f, 49.0f, 50.0f, 51.0f, - }; - vector X_shape = {1, 13, 2, 2}; + + // C = 13 + 52.0f, 53.0f, + 54.0f, 55.0f, + + // C = 14 + 56.0f, 57.0f, + 58.0f, 59.0f}; + vector X_shape = {1, 15, 2, 2}; vector W = { // M = 0 0.0f, 1.0f, @@ -790,11 +833,18 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) { // M = 12 48.0f, 49.0f, 50.0f, 51.0f, - }; - vector W_shape = {13, 1, 2, 2}; - vector B = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f}; - vector B_shape = {13}; - vector Y_shape = {1, 13, 1, 1}; + + // M = 13 + 52.0f, 53.0f, + 54.0f, 55.0f, + + // M = 14 + 56.0f, 57.0f, + 58.0f, 59.0f}; + vector W_shape = {15, 1, 2, 2}; + vector B = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + vector B_shape = {15}; + vector Y_shape = {1, 15, 1, 1}; auto expected_vals = { 15.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0 128.0f, @@ -808,12 +858,12 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) { 5640.0f, 6905.0f, 8298.0f, - 9819.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 + 9819.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 + 11468.0f, // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 14.0 + 13245.0f // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 15.0 }; TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); - - // NNAPI/CoreML EP requires weight to be an initializer TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); } From 3df8960cb2498edbda90326ded29208e5b6396ec Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Sat, 27 Jul 2024 05:17:37 +0000 Subject: [PATCH 5/8] Fix uint16x4_t to MLAS_FLOAT16X4 conversion error --- onnxruntime/core/mlas/lib/fp16_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/lib/fp16_common.h b/onnxruntime/core/mlas/lib/fp16_common.h index 4ed186fcd0cfb..30b66cdb2ea78 100644 --- a/onnxruntime/core/mlas/lib/fp16_common.h +++ b/onnxruntime/core/mlas/lib/fp16_common.h @@ -70,7 +70,7 @@ MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len) { MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4(); if ((len & 1) != 0) { - Vector = vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0); + Vector = vreinterpret_f16_u16(vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0)); } if ((len & 2) != 0) { Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0)); From 7725154c33bea43e3fe04de07ed89516332cf577 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Sat, 27 Jul 2024 05:26:53 +0000 Subject: [PATCH 6/8] Add comment for MlasConvDepthwise new parameter --- onnxruntime/core/mlas/inc/mlas.h | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 2e6ef0287cf11..e46105324a7fb 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -1751,6 +1751,7 @@ MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* Pac * @brief Indirect Depthwise convolution for fp16 * @param Input Supplies the indirect buffer for NHWC input * @param Filter Supplies the address for filter tensor + * @param Bias Supplies the address for 1D bias tensor B, has size of M * @param Output Supplies the address for the result tensor * @param Channels # of input channels * @param OutputCount # of output pixels From f479051b065663b21852b5b31a54c0d9241aef56 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Sat, 27 Jul 2024 10:40:13 +0000 Subject: [PATCH 7/8] Update docs/OperatorKernels.md --- docs/OperatorKernels.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index ed944b5a6df79..be3ca050eca9e 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -58,8 +58,8 @@ Do not modify directly.* |Ceil|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float)| |||[6, 12]|**T** = tensor(double), tensor(float)| |Celu|*in* X:**T**
*out* Y:**T**|12+|**T** = tensor(float)| -|Clip|*in* input:**T**
*in* min:**T**
*in* max:**T**
*out* output:**T**

or

*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| -|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| +|Clip|*in* input:**T**
*in* min:**T**
*in* max:**T**
*out* output:**T**

or

*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| +|||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| |||11|**T** = tensor(float)| |||[6, 10]|**T** = tensor(float)| |Col2Im|*in* input:**T**
*in* image_shape:**tensor(int64)**
*in* block_shape:**tensor(int64)**
*out* output:**T**|18+|**T** = tensor(float)| From 8fbe6dfb10a0760ffd3eac9d8bde34ff568b297c Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Sat, 27 Jul 2024 22:57:11 +0000 Subject: [PATCH 8/8] Amend tests so bias cannot be ignored --- .../test/providers/cpu/nn/conv_fp16_test.cc | 60 +++++++++---------- .../test/providers/cpu/nn/conv_op_test.cc | 47 ++++++++++----- 2 files changed, 61 insertions(+), 46 deletions(-) diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc index b1bbe9b865178..95b274966fbbb 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc @@ -910,39 +910,39 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Group15) { MLFloat16(58.0f), MLFloat16(59.0f)}; vector W_shape = {15, 1, 2, 2}; vector B = { - MLFloat16(1.0f), - MLFloat16(2.0f), - MLFloat16(3.0f), - MLFloat16(4.0f), - MLFloat16(5.0f), - MLFloat16(6.0f), - MLFloat16(7.0f), - MLFloat16(8.0f), - MLFloat16(9.0f), - MLFloat16(10.0f), - MLFloat16(11.0f), - MLFloat16(12.0f), - MLFloat16(13.0f), - MLFloat16(14.0f), - MLFloat16(15.0f)}; + MLFloat16(101.0f), + MLFloat16(102.0f), + MLFloat16(103.0f), + MLFloat16(104.0f), + MLFloat16(105.0f), + MLFloat16(106.0f), + MLFloat16(107.0f), + MLFloat16(108.0f), + MLFloat16(109.0f), + MLFloat16(110.0f), + MLFloat16(111.0f), + MLFloat16(112.0f), + MLFloat16(113.0f), + MLFloat16(114.0f), + MLFloat16(115.0f)}; vector B_shape = {15}; vector Y_shape = {1, 15, 1, 1}; auto expected_vals = { - MLFloat16(15.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0 - MLFloat16(128.0f), - MLFloat16(369.0f), - MLFloat16(738.0f), - MLFloat16(1235.0f), - MLFloat16(1860.0f), - MLFloat16(2613.0f), // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 7.0 - MLFloat16(3494.0f), - MLFloat16(4503.0f), - MLFloat16(5640.0f), - MLFloat16(6905.0f), - MLFloat16(8298.0f), - MLFloat16(9819.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 - MLFloat16(11468.0f), // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 14.0 - MLFloat16(13245.0f) // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 15.0 + MLFloat16(115.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 101.0 + MLFloat16(228.0f), + MLFloat16(469.0f), + MLFloat16(838.0f), + MLFloat16(1335.0f), + MLFloat16(1960.0f), + MLFloat16(2713.0f), // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 107.0 + MLFloat16(3594.0f), + MLFloat16(4603.0f), + MLFloat16(5740.0f), + MLFloat16(7005.0f), + MLFloat16(8398.0f), + MLFloat16(9919.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 113.0 + MLFloat16(11568.0f), // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 114.0 + MLFloat16(13345.0f) // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 115.0 }; TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 418f9bd7794ac..2d885ee9d479f 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -842,25 +842,40 @@ TEST(ConvTest, Depthwise2D_Bias_Group15) { 56.0f, 57.0f, 58.0f, 59.0f}; vector W_shape = {15, 1, 2, 2}; - vector B = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + vector B = { + 101.0f, + 102.0f, + 103.0f, + 104.0f, + 105.0f, + 106.0f, + 107.0f, + 108.0f, + 109.0f, + 110.0f, + 111.0f, + 112.0f, + 113.0f, + 114.0f, + 115.0f}; vector B_shape = {15}; vector Y_shape = {1, 15, 1, 1}; auto expected_vals = { - 15.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0 - 128.0f, - 369.0f, - 738.0f, - 1235.0f, - 1860.0f, - 2613.0f, // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 7.0 - 3494.0f, - 4503.0f, - 5640.0f, - 6905.0f, - 8298.0f, - 9819.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 - 11468.0f, // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 14.0 - 13245.0f // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 15.0 + 115.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 101.0 + 228.0f, + 469.0f, + 838.0f, + 1335.0f, + 1960.0f, + 2713.0f, // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 107.0 + 3594.0f, + 4603.0f, + 5740.0f, + 7005.0f, + 8398.0f, + 9919.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 113.0 + 11568.0f, // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 114.0 + 13345.0f // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 115.0 }; TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);