diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index ed944b5a6df79..be3ca050eca9e 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -58,8 +58,8 @@ Do not modify directly.* |Ceil|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float)| |||[6, 12]|**T** = tensor(double), tensor(float)| |Celu|*in* X:**T**
*out* Y:**T**|12+|**T** = tensor(float)| -|Clip|*in* input:**T**
*in* min:**T**
*in* max:**T**
*out* output:**T**

or

*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| -|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| +|Clip|*in* input:**T**
*in* min:**T**
*in* max:**T**
*out* output:**T**

or

*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| +|||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| |||11|**T** = tensor(float)| |||[6, 10]|**T** = tensor(float)| |Col2Im|*in* input:**T**
*in* image_shape:**tensor(int64)**
*in* block_shape:**tensor(int64)**
*out* output:**T**|18+|**T** = tensor(float)| diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 675f7c7a13e8c..e46105324a7fb 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -1751,6 +1751,7 @@ MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* Pac * @brief Indirect Depthwise convolution for fp16 * @param Input Supplies the indirect buffer for NHWC input * @param Filter Supplies the address for filter tensor + * @param Bias Supplies the address for 1D bias tensor B, has size of M * @param Output Supplies the address for the result tensor * @param Channels # of input channels * @param OutputCount # of output pixels @@ -1762,6 +1763,7 @@ MLASCALL MlasConvDepthwise( const MLAS_FP16* const* Input, const MLAS_FP16* Filter, + const MLAS_FP16* Bias, MLAS_FP16* Output, size_t Channels, size_t OutputCount, diff --git a/onnxruntime/core/mlas/lib/dwconv.cpp b/onnxruntime/core/mlas/lib/dwconv.cpp index 15511d2d8ceac..d48d9cbb17502 100644 --- a/onnxruntime/core/mlas/lib/dwconv.cpp +++ b/onnxruntime/core/mlas/lib/dwconv.cpp @@ -14,7 +14,6 @@ Module Name: --*/ - #include "fp16_common.h" #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -24,19 +23,20 @@ void MlasConvDepthwiseKernel( const _mlas_fp16_* const* Input, const _mlas_fp16_* Filter, + const _mlas_fp16_* Bias, _mlas_fp16_* Output, size_t Channels, size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { while (OutputCount > 0) { size_t ChannelOffset = 0; size_t c = Channels; while (c >= 8) { - MLAS_FLOAT16X8 Accumulator = MlasZeroFloat16x8(); + MLAS_FLOAT16X8 Accumulator = Bias == nullptr ? MlasZeroFloat16x8() : MlasLoadFloat16x8(&Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -54,7 +54,7 @@ MlasConvDepthwiseKernel( } if (c >= 4) { - MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4(); + MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -72,7 +72,8 @@ MlasConvDepthwiseKernel( } if (c > 0) { - MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4(); + MLAS_FLOAT16X4 Accumulator = + Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -86,8 +87,7 @@ MlasConvDepthwiseKernel( Output += c; } if (PostProc) { - PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, - Channels); + PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, Channels); } Input += KernelSize; OutputCount -= 1; @@ -101,16 +101,17 @@ void MlasConvDepthwiseKernel( const _mlas_fp16_* const* Input, const _mlas_fp16_* Filter, + const _mlas_fp16_* Bias, _mlas_fp16_* Output, size_t Channels, size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { while (OutputCount > 0) { for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) { - float Accumulator = 0.0f; + float Accumulator = Bias == nullptr ? 0.0f : MLAS_Half2Float(Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -120,35 +121,36 @@ MlasConvDepthwiseKernel( *Output++ = MLAS_Float2Half(Accumulator); } if (PostProc) { - PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, - Channels); + PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, Channels); } Input += KernelSize; OutputCount -= 1; } } -#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED - +#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED void MLASCALL MlasConvDepthwise( const MLAS_FP16* const* Input, const MLAS_FP16* Filter, + const MLAS_FP16* Bias, MLAS_FP16* Output, size_t Channels, size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { MlasConvDepthwiseKernel( reinterpret_cast(Input), reinterpret_cast(Filter), + reinterpret_cast(Bias), reinterpret_cast<_mlas_fp16_*>(Output), Channels, OutputCount, KernelSize, - PostProc); + PostProc + ); } diff --git a/onnxruntime/core/mlas/lib/fp16_common.h b/onnxruntime/core/mlas/lib/fp16_common.h index 1fcab870af64f..30b66cdb2ea78 100644 --- a/onnxruntime/core/mlas/lib/fp16_common.h +++ b/onnxruntime/core/mlas/lib/fp16_common.h @@ -64,6 +64,23 @@ MLAS_FORCEINLINE MLAS_FLOAT16X4 MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); } +MLAS_FORCEINLINE +MLAS_FLOAT16X4 +MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len) +{ + MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4(); + if ((len & 1) != 0) { + Vector = vreinterpret_f16_u16(vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0)); + } + if ((len & 2) != 0) { + Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0)); + Vector = vreinterpret_f16_f32( + vld1_lane_f32(reinterpret_cast(Buffer), vreinterpret_f32_f16(Vector), 0) + ); + } + return Vector; +} + MLAS_FORCEINLINE void MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector) diff --git a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc index e6867f10819ae..37db095e92570 100644 --- a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc +++ b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc @@ -139,8 +139,9 @@ Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr bool share_prepacked_weights = (prepacked_weights != nullptr); + const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1); // Don't pack the filter buffer if the MlasConvDepthwise path is used. - if (!(group_input_channels == 1 && group_output_channels == 1)) { + if (!is_depthwise_conv) { packed_W_size_ = MlasHalfGemmPackBSize(group_output_channels, kernel_dim, false); if (packed_W_size_ != 0) { size_t packed_W_data_size = SafeInt(group_count) * packed_W_size_; @@ -472,6 +473,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const { MlasConvDepthwise( worker_indirection_buffer, reordered_W, + Bdata, worker_output, static_cast(M), static_cast(output_count), diff --git a/onnxruntime/core/providers/cpu/math/clip.cc b/onnxruntime/core/providers/cpu/math/clip.cc index ddb64a5a0e461..200469bc47835 100644 --- a/onnxruntime/core/providers/cpu/math/clip.cc +++ b/onnxruntime/core/providers/cpu/math/clip.cc @@ -23,7 +23,7 @@ ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES( float); ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES( kCpuExecutionProvider, kOnnxDomain, Clip, 12, Input, 0, - float, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t); + float, MLFloat16, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t); } // namespace op_kernel_type_control using EnabledClip11Types = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST( diff --git a/onnxruntime/test/providers/cpu/math/clip_test.cc b/onnxruntime/test/providers/cpu/math/clip_test.cc index 6f81bbbe31d54..9948a6cc8a681 100644 --- a/onnxruntime/test/providers/cpu/math/clip_test.cc +++ b/onnxruntime/test/providers/cpu/math/clip_test.cc @@ -119,6 +119,24 @@ TEST(MathOpTest, Clip_Default_uint64) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +TEST(MathOpTest, Clip_MLFloat16) { + OpTester test("Clip", 12); + + std::vector dims{3, 3}; + test.AddInput("X", dims, + {MLFloat16(-1.0f), MLFloat16(-2.0f), MLFloat16(-3.0f), + MLFloat16(-4.0f), MLFloat16(0.0f), MLFloat16(2.0f), + MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(8.0f)}); + test.AddInput("min", {}, {MLFloat16(0.0f)}); + test.AddInput("max", {}, {MLFloat16(6.0f)}); + test.AddOutput("Y", dims, + {MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(0.0f), + MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(2.0f), + MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(6.0f)}); + + test.Run(); +} + TEST(MathOpTest, Clip_int32) { OpTester test("Clip", 12); diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc index cb5fc8095982c..95b274966fbbb 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc @@ -714,6 +714,241 @@ TEST(ConvFp16Test, Conv2D_group) { TestConvFp16Op(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +TEST(ConvFp16Test, Depthwise2D_Bias_Group1_Issue18992) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {MLFloat16(1.0f)}; + vector X_shape = {1, 1, 1, 1}; + vector W = {MLFloat16(0.5f)}; + vector W_shape = {1, 1, 1, 1}; + vector B = {MLFloat16(0.5f)}; + vector B_shape = {1}; + vector Y_shape = {1, 1, 1, 1}; + auto expected_vals = {MLFloat16(1.0f)}; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvFp16Test, Depthwise2D_Bias_Group2) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 2, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + MLFloat16(0.0f), MLFloat16(1.0f), MLFloat16(2.0f), + MLFloat16(3.0f), MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), MLFloat16(8.0f), + + MLFloat16(9.0f), MLFloat16(10.0f), MLFloat16(11.0f), + MLFloat16(12.0f), MLFloat16(13.0f), MLFloat16(14.0f), + MLFloat16(15.0f), MLFloat16(16.0f), MLFloat16(17.0f)}; + vector X_shape = {1, 2, 3, 3}; + vector W = {MLFloat16(1.0f), MLFloat16(2.0f)}; + vector W_shape = {2, 1, 1, 1}; + vector B = {MLFloat16(1.0f), MLFloat16(-1.0f)}; + vector B_shape = {2}; + vector Y_shape = {1, 2, 3, 3}; + auto expected_vals = { + MLFloat16(1.0f), MLFloat16(2.0f), MLFloat16(3.0f), + MLFloat16(4.0f), MLFloat16(5.0f), MLFloat16(6.0f), + MLFloat16(7.0f), MLFloat16(8.0f), MLFloat16(9.0f), + + MLFloat16(17.0f), MLFloat16(19.0f), MLFloat16(21.0f), + MLFloat16(23.0f), MLFloat16(25.0f), MLFloat16(27.0f), + MLFloat16(29.0f), MLFloat16(31.0f), MLFloat16(33.0f)}; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvFp16Test, Depthwise2D_Bias_Group15) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 15, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + // C = 0 + MLFloat16(0.0f), MLFloat16(1.0f), + MLFloat16(2.0f), MLFloat16(3.0f), + + // C = 1 + MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), + + // C = 2 + MLFloat16(8.0f), MLFloat16(9.0f), + MLFloat16(10.0f), MLFloat16(11.0f), + + // C = 3 + MLFloat16(12.0f), MLFloat16(13.0f), + MLFloat16(14.0f), MLFloat16(15.0f), + + // C = 4 + MLFloat16(16.0f), MLFloat16(17.0f), + MLFloat16(18.0f), MLFloat16(19.0f), + + // C = 5 + MLFloat16(20.0f), MLFloat16(21.0f), + MLFloat16(22.0f), MLFloat16(23.0f), + + // C = 6 + MLFloat16(24.0f), MLFloat16(25.0f), + MLFloat16(26.0f), MLFloat16(27.0f), + + // C = 7 + MLFloat16(28.0f), MLFloat16(29.0f), + MLFloat16(30.0f), MLFloat16(31.0f), + + // C = 8 + MLFloat16(32.0f), MLFloat16(33.0f), + MLFloat16(34.0f), MLFloat16(35.0f), + + // C = 9 + MLFloat16(36.0f), MLFloat16(37.0f), + MLFloat16(38.0f), MLFloat16(39.0f), + + // C = 10 + MLFloat16(40.0f), MLFloat16(41.0f), + MLFloat16(42.0f), MLFloat16(43.0f), + + // C = 11 + MLFloat16(44.0f), MLFloat16(45.0f), + MLFloat16(46.0f), MLFloat16(47.0f), + + // C = 12 + MLFloat16(48.0f), MLFloat16(49.0f), + MLFloat16(50.0f), MLFloat16(51.0f), + + // C = 13 + MLFloat16(52.0f), MLFloat16(53.0f), + MLFloat16(54.0f), MLFloat16(55.0f), + + // C = 14 + MLFloat16(56.0f), MLFloat16(57.0f), + MLFloat16(58.0f), MLFloat16(59.0f)}; + vector X_shape = {1, 15, 2, 2}; + vector W = { + // M = 0 + MLFloat16(0.0f), MLFloat16(1.0f), + MLFloat16(2.0f), MLFloat16(3.0f), + + // M = 1 + MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), + + // M = 2 + MLFloat16(8.0f), MLFloat16(9.0f), + MLFloat16(10.0f), MLFloat16(11.0f), + + // M = 3 + MLFloat16(12.0f), MLFloat16(13.0f), + MLFloat16(14.0f), MLFloat16(15.0f), + + // M = 4 + MLFloat16(16.0f), MLFloat16(17.0f), + MLFloat16(18.0f), MLFloat16(19.0f), + + // M = 5 + MLFloat16(20.0f), MLFloat16(21.0f), + MLFloat16(22.0f), MLFloat16(23.0f), + + // M = 6 + MLFloat16(24.0f), MLFloat16(25.0f), + MLFloat16(26.0f), MLFloat16(27.0f), + + // M = 7 + MLFloat16(28.0f), MLFloat16(29.0f), + MLFloat16(30.0f), MLFloat16(31.0f), + + // M = 8 + MLFloat16(32.0f), MLFloat16(33.0f), + MLFloat16(34.0f), MLFloat16(35.0f), + + // M = 9 + MLFloat16(36.0f), MLFloat16(37.0f), + MLFloat16(38.0f), MLFloat16(39.0f), + + // M = 10 + MLFloat16(40.0f), MLFloat16(41.0f), + MLFloat16(42.0f), MLFloat16(43.0f), + + // M = 11 + MLFloat16(44.0f), MLFloat16(45.0f), + MLFloat16(46.0f), MLFloat16(47.0f), + + // M = 12 + MLFloat16(48.0f), MLFloat16(49.0f), + MLFloat16(50.0f), MLFloat16(51.0f), + + // M = 13 + MLFloat16(52.0f), MLFloat16(53.0f), + MLFloat16(54.0f), MLFloat16(55.0f), + + // M = 14 + MLFloat16(56.0f), MLFloat16(57.0f), + MLFloat16(58.0f), MLFloat16(59.0f)}; + vector W_shape = {15, 1, 2, 2}; + vector B = { + MLFloat16(101.0f), + MLFloat16(102.0f), + MLFloat16(103.0f), + MLFloat16(104.0f), + MLFloat16(105.0f), + MLFloat16(106.0f), + MLFloat16(107.0f), + MLFloat16(108.0f), + MLFloat16(109.0f), + MLFloat16(110.0f), + MLFloat16(111.0f), + MLFloat16(112.0f), + MLFloat16(113.0f), + MLFloat16(114.0f), + MLFloat16(115.0f)}; + vector B_shape = {15}; + vector Y_shape = {1, 15, 1, 1}; + auto expected_vals = { + MLFloat16(115.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 101.0 + MLFloat16(228.0f), + MLFloat16(469.0f), + MLFloat16(838.0f), + MLFloat16(1335.0f), + MLFloat16(1960.0f), + MLFloat16(2713.0f), // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 107.0 + MLFloat16(3594.0f), + MLFloat16(4603.0f), + MLFloat16(5740.0f), + MLFloat16(7005.0f), + MLFloat16(8398.0f), + MLFloat16(9919.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 113.0 + MLFloat16(11568.0f), // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 114.0 + MLFloat16(13345.0f) // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 115.0 + }; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + TEST(ConvFp16Test, ConvDimWithZero) { ConvOpAndTestAttributes attrs = { "", // auto_pad @@ -1074,4 +1309,4 @@ TEST(ConvFp16Test, SharedPrepackedWeights) { } // namespace test } // namespace onnxruntime -#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED \ No newline at end of file +#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 0efa78af2795c..2d885ee9d479f 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -647,6 +647,241 @@ TEST(ConvTest, Conv2D_group) { TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +TEST(ConvTest, Depthwise2D_Bias_Group1_Issue18992) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {1.0f}; + vector X_shape = {1, 1, 1, 1}; + vector W = {0.5f}; + vector W_shape = {1, 1, 1, 1}; + vector B = {0.5f}; + vector B_shape = {1}; + vector Y_shape = {1, 1, 1, 1}; + auto expected_vals = {1.0f}; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Depthwise2D_Bias_Group2) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 2, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + 0.0f, 1.0f, 2.0f, + 3.0f, 4.0f, 5.0f, + 6.0f, 7.0f, 8.0f, + + 9.0f, 10.0f, 11.0f, + 12.0f, 13.0f, 14.0f, + 15.0f, 16.0f, 17.0f}; + vector X_shape = {1, 2, 3, 3}; + vector W = {1.0f, 2.0f}; + vector W_shape = {2, 1, 1, 1}; + vector B = {1.0f, -1.0f}; + vector B_shape = {2}; + vector Y_shape = {1, 2, 3, 3}; + auto expected_vals = { + 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, + + 17.0f, 19.0f, 21.0f, + 23.0f, 25.0f, 27.0f, + 29.0f, 31.0f, 33.0f}; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Depthwise2D_Bias_Group15) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 15, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + // C = 0 + 0.0f, 1.0f, + 2.0f, 3.0f, + + // C = 1 + 4.0f, 5.0f, + 6.0f, 7.0f, + + // C = 2 + 8.0f, 9.0f, + 10.0f, 11.0f, + + // C = 3 + 12.0f, 13.0f, + 14.0f, 15.0f, + + // C = 4 + 16.0f, 17.0f, + 18.0f, 19.0f, + + // C = 5 + 20.0f, 21.0f, + 22.0f, 23.0f, + + // C = 6 + 24.0f, 25.0f, + 26.0f, 27.0f, + + // C = 7 + 28.0f, 29.0f, + 30.0f, 31.0f, + + // C = 8 + 32.0f, 33.0f, + 34.0f, 35.0f, + + // C = 9 + 36.0f, 37.0f, + 38.0f, 39.0f, + + // C = 10 + 40.0f, 41.0f, + 42.0f, 43.0f, + + // C = 11 + 44.0f, 45.0f, + 46.0f, 47.0f, + + // C = 12 + 48.0f, 49.0f, + 50.0f, 51.0f, + + // C = 13 + 52.0f, 53.0f, + 54.0f, 55.0f, + + // C = 14 + 56.0f, 57.0f, + 58.0f, 59.0f}; + vector X_shape = {1, 15, 2, 2}; + vector W = { + // M = 0 + 0.0f, 1.0f, + 2.0f, 3.0f, + + // M = 1 + 4.0f, 5.0f, + 6.0f, 7.0f, + + // M = 2 + 8.0f, 9.0f, + 10.0f, 11.0f, + + // M = 3 + 12.0f, 13.0f, + 14.0f, 15.0f, + + // M = 4 + 16.0f, 17.0f, + 18.0f, 19.0f, + + // M = 5 + 20.0f, 21.0f, + 22.0f, 23.0f, + + // M = 6 + 24.0f, 25.0f, + 26.0f, 27.0f, + + // M = 7 + 28.0f, 29.0f, + 30.0f, 31.0f, + + // M = 8 + 32.0f, 33.0f, + 34.0f, 35.0f, + + // M = 9 + 36.0f, 37.0f, + 38.0f, 39.0f, + + // M = 10 + 40.0f, 41.0f, + 42.0f, 43.0f, + + // M = 11 + 44.0f, 45.0f, + 46.0f, 47.0f, + + // M = 12 + 48.0f, 49.0f, + 50.0f, 51.0f, + + // M = 13 + 52.0f, 53.0f, + 54.0f, 55.0f, + + // M = 14 + 56.0f, 57.0f, + 58.0f, 59.0f}; + vector W_shape = {15, 1, 2, 2}; + vector B = { + 101.0f, + 102.0f, + 103.0f, + 104.0f, + 105.0f, + 106.0f, + 107.0f, + 108.0f, + 109.0f, + 110.0f, + 111.0f, + 112.0f, + 113.0f, + 114.0f, + 115.0f}; + vector B_shape = {15}; + vector Y_shape = {1, 15, 1, 1}; + auto expected_vals = { + 115.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 101.0 + 228.0f, + 469.0f, + 838.0f, + 1335.0f, + 1960.0f, + 2713.0f, // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 107.0 + 3594.0f, + 4603.0f, + 5740.0f, + 7005.0f, + 8398.0f, + 9919.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 113.0 + 11568.0f, // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 114.0 + 13345.0f // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 115.0 + }; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + TEST(ConvTest, ConvDimWithZero) { ConvOpAndTestAttributes attrs = { "", // auto_pad