From 26d31a02606e4e9deb3b783e0d55acbea2d6c068 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Sat, 27 Jul 2024 03:58:11 +0000 Subject: [PATCH] Implement MlasLoadPartialFloat16x4 --- onnxruntime/core/mlas/lib/dwconv.cpp | 3 +- onnxruntime/core/mlas/lib/fp16_common.h | 17 ++++ .../test/providers/cpu/nn/conv_fp16_test.cc | 65 +++++++++++--- .../test/providers/cpu/nn/conv_op_test.cc | 84 +++++++++++++++---- 4 files changed, 138 insertions(+), 31 deletions(-) diff --git a/onnxruntime/core/mlas/lib/dwconv.cpp b/onnxruntime/core/mlas/lib/dwconv.cpp index 87effeac5fd9f..d48d9cbb17502 100644 --- a/onnxruntime/core/mlas/lib/dwconv.cpp +++ b/onnxruntime/core/mlas/lib/dwconv.cpp @@ -72,7 +72,8 @@ MlasConvDepthwiseKernel( } if (c > 0) { - MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]); + MLAS_FLOAT16X4 Accumulator = + Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { diff --git a/onnxruntime/core/mlas/lib/fp16_common.h b/onnxruntime/core/mlas/lib/fp16_common.h index 1fcab870af64f..4ed186fcd0cfb 100644 --- a/onnxruntime/core/mlas/lib/fp16_common.h +++ b/onnxruntime/core/mlas/lib/fp16_common.h @@ -64,6 +64,23 @@ MLAS_FORCEINLINE MLAS_FLOAT16X4 MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); } +MLAS_FORCEINLINE +MLAS_FLOAT16X4 +MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len) +{ + MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4(); + if ((len & 1) != 0) { + Vector = vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0); + } + if ((len & 2) != 0) { + Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0)); + Vector = vreinterpret_f16_f32( + vld1_lane_f32(reinterpret_cast(Buffer), vreinterpret_f32_f16(Vector), 0) + ); + } + return Vector; +} + MLAS_FORCEINLINE void MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector) diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc index f723bd10ec581..b1bbe9b865178 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc @@ -714,7 +714,31 @@ TEST(ConvFp16Test, Conv2D_group) { TestConvFp16Op(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } -TEST(ConvFp16Test, Depthwise2D_Bias) { +TEST(ConvFp16Test, Depthwise2D_Bias_Group1_Issue18992) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {MLFloat16(1.0f)}; + vector X_shape = {1, 1, 1, 1}; + vector W = {MLFloat16(0.5f)}; + vector W_shape = {1, 1, 1, 1}; + vector B = {MLFloat16(0.5f)}; + vector B_shape = {1}; + vector Y_shape = {1, 1, 1, 1}; + auto expected_vals = {MLFloat16(1.0f)}; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvFp16Test, Depthwise2D_Bias_Group2) { ConvOpAndTestAttributes attrs = { "", // auto_pad vector{1, 1}, // dilations @@ -752,11 +776,11 @@ TEST(ConvFp16Test, Depthwise2D_Bias) { TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); } -TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { +TEST(ConvFp16Test, Depthwise2D_Bias_Group15) { ConvOpAndTestAttributes attrs = { "", // auto_pad vector{1, 1}, // dilations - 13, // group + 15, // group vector{2, 2}, // kernel_shape vector{0, 0, 0, 0}, // pads vector{1, 1}, // strides @@ -815,8 +839,15 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { // C = 12 MLFloat16(48.0f), MLFloat16(49.0f), MLFloat16(50.0f), MLFloat16(51.0f), - }; - vector X_shape = {1, 13, 2, 2}; + + // C = 13 + MLFloat16(52.0f), MLFloat16(53.0f), + MLFloat16(54.0f), MLFloat16(55.0f), + + // C = 14 + MLFloat16(56.0f), MLFloat16(57.0f), + MLFloat16(58.0f), MLFloat16(59.0f)}; + vector X_shape = {1, 15, 2, 2}; vector W = { // M = 0 MLFloat16(0.0f), MLFloat16(1.0f), @@ -869,8 +900,15 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { // M = 12 MLFloat16(48.0f), MLFloat16(49.0f), MLFloat16(50.0f), MLFloat16(51.0f), - }; - vector W_shape = {13, 1, 2, 2}; + + // M = 13 + MLFloat16(52.0f), MLFloat16(53.0f), + MLFloat16(54.0f), MLFloat16(55.0f), + + // M = 14 + MLFloat16(56.0f), MLFloat16(57.0f), + MLFloat16(58.0f), MLFloat16(59.0f)}; + vector W_shape = {15, 1, 2, 2}; vector B = { MLFloat16(1.0f), MLFloat16(2.0f), @@ -885,9 +923,10 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { MLFloat16(11.0f), MLFloat16(12.0f), MLFloat16(13.0f), - }; - vector B_shape = {13}; - vector Y_shape = {1, 13, 1, 1}; + MLFloat16(14.0f), + MLFloat16(15.0f)}; + vector B_shape = {15}; + vector Y_shape = {1, 15, 1, 1}; auto expected_vals = { MLFloat16(15.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0 MLFloat16(128.0f), @@ -901,12 +940,12 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) { MLFloat16(5640.0f), MLFloat16(6905.0f), MLFloat16(8298.0f), - MLFloat16(9819.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 + MLFloat16(9819.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 + MLFloat16(11468.0f), // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 14.0 + MLFloat16(13245.0f) // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 15.0 }; TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); - - // NNAPI/CoreML EP requires weight to be an initializer TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); } diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 8cf8c027bdde2..418f9bd7794ac 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -647,7 +647,31 @@ TEST(ConvTest, Conv2D_group) { TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } -TEST(ConvTest, Depthwise2D_Bias) { +TEST(ConvTest, Depthwise2D_Bias_Group1_Issue18992) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {1.0f}; + vector X_shape = {1, 1, 1, 1}; + vector W = {0.5f}; + vector W_shape = {1, 1, 1, 1}; + vector B = {0.5f}; + vector B_shape = {1}; + vector Y_shape = {1, 1, 1, 1}; + auto expected_vals = {1.0f}; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Depthwise2D_Bias_Group2) { ConvOpAndTestAttributes attrs = { "", // auto_pad vector{1, 1}, // dilations @@ -658,26 +682,38 @@ TEST(ConvTest, Depthwise2D_Bias) { {} // excluded EPs }; - vector X = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f}; + vector X = { + 0.0f, 1.0f, 2.0f, + 3.0f, 4.0f, 5.0f, + 6.0f, 7.0f, 8.0f, + + 9.0f, 10.0f, 11.0f, + 12.0f, 13.0f, 14.0f, + 15.0f, 16.0f, 17.0f}; vector X_shape = {1, 2, 3, 3}; vector W = {1.0f, 2.0f}; vector W_shape = {2, 1, 1, 1}; vector B = {1.0f, -1.0f}; vector B_shape = {2}; vector Y_shape = {1, 2, 3, 3}; - auto expected_vals = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 17.0f, 19.0f, 21.0f, 23.0f, 25.0f, 27.0f, 29.0f, 31.0f, 33.0f}; + auto expected_vals = { + 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, - TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + 17.0f, 19.0f, 21.0f, + 23.0f, 25.0f, 27.0f, + 29.0f, 31.0f, 33.0f}; - // NNAPI/CoreML EP requires weight to be an initializer + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); } -TEST(ConvTest, Depthwise2D_Bias_Complex) { +TEST(ConvTest, Depthwise2D_Bias_Group15) { ConvOpAndTestAttributes attrs = { "", // auto_pad vector{1, 1}, // dilations - 13, // group + 15, // group vector{2, 2}, // kernel_shape vector{0, 0, 0, 0}, // pads vector{1, 1}, // strides @@ -736,8 +772,15 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) { // C = 12 48.0f, 49.0f, 50.0f, 51.0f, - }; - vector X_shape = {1, 13, 2, 2}; + + // C = 13 + 52.0f, 53.0f, + 54.0f, 55.0f, + + // C = 14 + 56.0f, 57.0f, + 58.0f, 59.0f}; + vector X_shape = {1, 15, 2, 2}; vector W = { // M = 0 0.0f, 1.0f, @@ -790,11 +833,18 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) { // M = 12 48.0f, 49.0f, 50.0f, 51.0f, - }; - vector W_shape = {13, 1, 2, 2}; - vector B = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f}; - vector B_shape = {13}; - vector Y_shape = {1, 13, 1, 1}; + + // M = 13 + 52.0f, 53.0f, + 54.0f, 55.0f, + + // M = 14 + 56.0f, 57.0f, + 58.0f, 59.0f}; + vector W_shape = {15, 1, 2, 2}; + vector B = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + vector B_shape = {15}; + vector Y_shape = {1, 15, 1, 1}; auto expected_vals = { 15.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0 128.0f, @@ -808,12 +858,12 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) { 5640.0f, 6905.0f, 8298.0f, - 9819.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 + 9819.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0 + 11468.0f, // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 14.0 + 13245.0f // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 15.0 }; TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); - - // NNAPI/CoreML EP requires weight to be an initializer TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); }