diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index ed944b5a6df79..be3ca050eca9e 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -58,8 +58,8 @@ Do not modify directly.*
|Ceil|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
|Celu|*in* X:**T**<br> *out* Y:**T**|12+|**T** = tensor(float)|
-|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br> *in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
-|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br> *in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||11|**T** = tensor(float)|
|||[6, 10]|**T** = tensor(float)|
|Col2Im|*in* input:**T**<br> *in* image_shape:**tensor(int64)**<br> *in* block_shape:**tensor(int64)**<br> *out* output:**T**|18+|**T** = tensor(float)|
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index 675f7c7a13e8c..e46105324a7fb 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -1751,6 +1751,7 @@ MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* Pac
* @brief Indirect Depthwise convolution for fp16
* @param Input Supplies the indirect buffer for NHWC input
* @param Filter Supplies the address for filter tensor
+ * @param Bias Supplies the address for 1D bias tensor B, has size of M
* @param Output Supplies the address for the result tensor
* @param Channels # of input channels
* @param OutputCount # of output pixels
@@ -1762,6 +1763,7 @@ MLASCALL
MlasConvDepthwise(
const MLAS_FP16* const* Input,
const MLAS_FP16* Filter,
+ const MLAS_FP16* Bias,
MLAS_FP16* Output,
size_t Channels,
size_t OutputCount,
diff --git a/onnxruntime/core/mlas/lib/dwconv.cpp b/onnxruntime/core/mlas/lib/dwconv.cpp
index 15511d2d8ceac..d48d9cbb17502 100644
--- a/onnxruntime/core/mlas/lib/dwconv.cpp
+++ b/onnxruntime/core/mlas/lib/dwconv.cpp
@@ -14,7 +14,6 @@ Module Name:
--*/
-
#include "fp16_common.h"
#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
@@ -24,19 +23,20 @@ void
MlasConvDepthwiseKernel(
const _mlas_fp16_* const* Input,
const _mlas_fp16_* Filter,
+ const _mlas_fp16_* Bias,
_mlas_fp16_* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
- )
+)
{
while (OutputCount > 0) {
size_t ChannelOffset = 0;
size_t c = Channels;
while (c >= 8) {
- MLAS_FLOAT16X8 Accumulator = MlasZeroFloat16x8();
+ MLAS_FLOAT16X8 Accumulator = Bias == nullptr ? MlasZeroFloat16x8() : MlasLoadFloat16x8(&Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;
for (size_t k = 0; k < KernelSize; k++) {
@@ -54,7 +54,7 @@ MlasConvDepthwiseKernel(
}
if (c >= 4) {
- MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
+ MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;
for (size_t k = 0; k < KernelSize; k++) {
@@ -72,7 +72,8 @@ MlasConvDepthwiseKernel(
}
if (c > 0) {
- MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
+ MLAS_FLOAT16X4 Accumulator =
+ Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c);
size_t ChannelKernelOffset = ChannelOffset;
for (size_t k = 0; k < KernelSize; k++) {
@@ -86,8 +87,7 @@ MlasConvDepthwiseKernel(
Output += c;
}
if (PostProc) {
- PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
- Channels);
+ PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels, Channels);
}
Input += KernelSize;
OutputCount -= 1;
@@ -101,16 +101,17 @@ void
MlasConvDepthwiseKernel(
const _mlas_fp16_* const* Input,
const _mlas_fp16_* Filter,
+ const _mlas_fp16_* Bias,
_mlas_fp16_* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
- )
+)
{
while (OutputCount > 0) {
for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) {
- float Accumulator = 0.0f;
+ float Accumulator = Bias == nullptr ? 0.0f : MLAS_Half2Float(Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;
for (size_t k = 0; k < KernelSize; k++) {
@@ -120,35 +121,36 @@ MlasConvDepthwiseKernel(
*Output++ = MLAS_Float2Half(Accumulator);
}
if (PostProc) {
- PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
- Channels);
+ PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels, Channels);
}
Input += KernelSize;
OutputCount -= 1;
}
}
-#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED
-
+#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED
void
MLASCALL
MlasConvDepthwise(
const MLAS_FP16* const* Input,
const MLAS_FP16* Filter,
+ const MLAS_FP16* Bias,
MLAS_FP16* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
- )
+)
{
MlasConvDepthwiseKernel(
reinterpret_cast<const _mlas_fp16_* const*>(Input),
reinterpret_cast<const _mlas_fp16_*>(Filter),
+ reinterpret_cast<const _mlas_fp16_*>(Bias),
reinterpret_cast<_mlas_fp16_*>(Output),
Channels,
OutputCount,
KernelSize,
- PostProc);
+ PostProc
+ );
}
diff --git a/onnxruntime/core/mlas/lib/fp16_common.h b/onnxruntime/core/mlas/lib/fp16_common.h
index 1fcab870af64f..30b66cdb2ea78 100644
--- a/onnxruntime/core/mlas/lib/fp16_common.h
+++ b/onnxruntime/core/mlas/lib/fp16_common.h
@@ -64,6 +64,23 @@ MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); }
+MLAS_FORCEINLINE
+MLAS_FLOAT16X4
+MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len)
+{
+ MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4();
+ if ((len & 1) != 0) {
+ Vector = vreinterpret_f16_u16(vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0));
+ }
+ if ((len & 2) != 0) {
+ Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0));
+ Vector = vreinterpret_f16_f32(
+ vld1_lane_f32(reinterpret_cast<const float32_t*>(Buffer), vreinterpret_f32_f16(Vector), 0)
+ );
+ }
+ return Vector;
+}
+
MLAS_FORCEINLINE
void
MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector)
diff --git a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
index e6867f10819ae..37db095e92570 100644
--- a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
+++ b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
@@ -139,8 +139,9 @@ Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr
bool share_prepacked_weights = (prepacked_weights != nullptr);
+ const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1);
// Don't pack the filter buffer if the MlasConvDepthwise path is used.
- if (!(group_input_channels == 1 && group_output_channels == 1)) {
+ if (!is_depthwise_conv) {
packed_W_size_ = MlasHalfGemmPackBSize(group_output_channels, kernel_dim, false);
if (packed_W_size_ != 0) {
size_t packed_W_data_size = SafeInt<size_t>(group_count) * packed_W_size_;
@@ -472,6 +473,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
MlasConvDepthwise(
worker_indirection_buffer,
reordered_W,
+ Bdata,
worker_output,
static_cast<size_t>(M),
static_cast<size_t>(output_count),
diff --git a/onnxruntime/core/providers/cpu/math/clip.cc b/onnxruntime/core/providers/cpu/math/clip.cc
index ddb64a5a0e461..200469bc47835 100644
--- a/onnxruntime/core/providers/cpu/math/clip.cc
+++ b/onnxruntime/core/providers/cpu/math/clip.cc
@@ -23,7 +23,7 @@ ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES(
float);
ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES(
kCpuExecutionProvider, kOnnxDomain, Clip, 12, Input, 0,
- float, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t);
+ float, MLFloat16, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t);
} // namespace op_kernel_type_control
using EnabledClip11Types = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(
diff --git a/onnxruntime/test/providers/cpu/math/clip_test.cc b/onnxruntime/test/providers/cpu/math/clip_test.cc
index 6f81bbbe31d54..9948a6cc8a681 100644
--- a/onnxruntime/test/providers/cpu/math/clip_test.cc
+++ b/onnxruntime/test/providers/cpu/math/clip_test.cc
@@ -119,6 +119,24 @@ TEST(MathOpTest, Clip_Default_uint64) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
}
+TEST(MathOpTest, Clip_MLFloat16) {
+ OpTester test("Clip", 12);
+
+ std::vector<int64_t> dims{3, 3};
+ test.AddInput<MLFloat16>("X", dims,
+ {MLFloat16(-1.0f), MLFloat16(-2.0f), MLFloat16(-3.0f),
+ MLFloat16(-4.0f), MLFloat16(0.0f), MLFloat16(2.0f),
+ MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(8.0f)});
+ test.AddInput<MLFloat16>("min", {}, {MLFloat16(0.0f)});
+ test.AddInput<MLFloat16>("max", {}, {MLFloat16(6.0f)});
+ test.AddOutput<MLFloat16>("Y", dims,
+ {MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(0.0f),
+ MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(2.0f),
+ MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(6.0f)});
+
+ test.Run();
+}
+
TEST(MathOpTest, Clip_int32) {
OpTester test("Clip", 12);
diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
index cb5fc8095982c..95b274966fbbb 100644
--- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
@@ -714,6 +714,241 @@ TEST(ConvFp16Test, Conv2D_group) {
TestConvFp16Op(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true);
}
+TEST(ConvFp16Test, Depthwise2D_Bias_Group1_Issue18992) {
+ ConvOpAndTestAttributes attrs = {
+ "", // auto_pad
+ vector<int64_t>{1, 1}, // dilations
+ 1, // group
+ vector<int64_t>{1, 1}, // kernel_shape
+ vector<int64_t>{0, 0, 0, 0}, // pads
+ vector<int64_t>{1, 1}, // strides
+ {} // excluded EPs
+ };
+
+ vector<MLFloat16> X = {MLFloat16(1.0f)};
+ vector<int64_t> X_shape = {1, 1, 1, 1};
+ vector<MLFloat16> W = {MLFloat16(0.5f)};
+ vector<int64_t> W_shape = {1, 1, 1, 1};
+ vector<MLFloat16> B = {MLFloat16(0.5f)};
+ vector<int64_t> B_shape = {1};
+ vector<int64_t> Y_shape = {1, 1, 1, 1};
+ auto expected_vals = {MLFloat16(1.0f)};
+
+ TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
+ TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
+}
+
+TEST(ConvFp16Test, Depthwise2D_Bias_Group2) {
+ ConvOpAndTestAttributes attrs = {
+ "", // auto_pad
+ vector<int64_t>{1, 1}, // dilations
+ 2, // group
+ vector<int64_t>{1, 1}, // kernel_shape
+ vector<int64_t>{0, 0, 0, 0}, // pads
+ vector<int64_t>{1, 1}, // strides
+ {} // excluded EPs
+ };
+
+ vector<MLFloat16> X = {
+ MLFloat16(0.0f), MLFloat16(1.0f), MLFloat16(2.0f),
+ MLFloat16(3.0f), MLFloat16(4.0f), MLFloat16(5.0f),
+ MLFloat16(6.0f), MLFloat16(7.0f), MLFloat16(8.0f),
+
+ MLFloat16(9.0f), MLFloat16(10.0f), MLFloat16(11.0f),
+ MLFloat16(12.0f), MLFloat16(13.0f), MLFloat16(14.0f),
+ MLFloat16(15.0f), MLFloat16(16.0f), MLFloat16(17.0f)};
+ vector<int64_t> X_shape = {1, 2, 3, 3};
+ vector<MLFloat16> W = {MLFloat16(1.0f), MLFloat16(2.0f)};
+ vector<int64_t> W_shape = {2, 1, 1, 1};
+ vector<MLFloat16> B = {MLFloat16(1.0f), MLFloat16(-1.0f)};
+ vector<int64_t> B_shape = {2};
+ vector<int64_t> Y_shape = {1, 2, 3, 3};
+ auto expected_vals = {
+ MLFloat16(1.0f), MLFloat16(2.0f), MLFloat16(3.0f),
+ MLFloat16(4.0f), MLFloat16(5.0f), MLFloat16(6.0f),
+ MLFloat16(7.0f), MLFloat16(8.0f), MLFloat16(9.0f),
+
+ MLFloat16(17.0f), MLFloat16(19.0f), MLFloat16(21.0f),
+ MLFloat16(23.0f), MLFloat16(25.0f), MLFloat16(27.0f),
+ MLFloat16(29.0f), MLFloat16(31.0f), MLFloat16(33.0f)};
+
+ TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
+ TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
+}
+
+TEST(ConvFp16Test, Depthwise2D_Bias_Group15) {
+ ConvOpAndTestAttributes attrs = {
+ "", // auto_pad
+ vector<int64_t>{1, 1}, // dilations
+ 15, // group
+ vector<int64_t>{2, 2}, // kernel_shape
+ vector<int64_t>{0, 0, 0, 0}, // pads
+ vector<int64_t>{1, 1}, // strides
+ {} // excluded EPs
+ };
+
+ vector<MLFloat16> X = {
+ // C = 0
+ MLFloat16(0.0f), MLFloat16(1.0f),
+ MLFloat16(2.0f), MLFloat16(3.0f),
+
+ // C = 1
+ MLFloat16(4.0f), MLFloat16(5.0f),
+ MLFloat16(6.0f), MLFloat16(7.0f),
+
+ // C = 2
+ MLFloat16(8.0f), MLFloat16(9.0f),
+ MLFloat16(10.0f), MLFloat16(11.0f),
+
+ // C = 3
+ MLFloat16(12.0f), MLFloat16(13.0f),
+ MLFloat16(14.0f), MLFloat16(15.0f),
+
+ // C = 4
+ MLFloat16(16.0f), MLFloat16(17.0f),
+ MLFloat16(18.0f), MLFloat16(19.0f),
+
+ // C = 5
+ MLFloat16(20.0f), MLFloat16(21.0f),
+ MLFloat16(22.0f), MLFloat16(23.0f),
+
+ // C = 6
+ MLFloat16(24.0f), MLFloat16(25.0f),
+ MLFloat16(26.0f), MLFloat16(27.0f),
+
+ // C = 7
+ MLFloat16(28.0f), MLFloat16(29.0f),
+ MLFloat16(30.0f), MLFloat16(31.0f),
+
+ // C = 8
+ MLFloat16(32.0f), MLFloat16(33.0f),
+ MLFloat16(34.0f), MLFloat16(35.0f),
+
+ // C = 9
+ MLFloat16(36.0f), MLFloat16(37.0f),
+ MLFloat16(38.0f), MLFloat16(39.0f),
+
+ // C = 10
+ MLFloat16(40.0f), MLFloat16(41.0f),
+ MLFloat16(42.0f), MLFloat16(43.0f),
+
+ // C = 11
+ MLFloat16(44.0f), MLFloat16(45.0f),
+ MLFloat16(46.0f), MLFloat16(47.0f),
+
+ // C = 12
+ MLFloat16(48.0f), MLFloat16(49.0f),
+ MLFloat16(50.0f), MLFloat16(51.0f),
+
+ // C = 13
+ MLFloat16(52.0f), MLFloat16(53.0f),
+ MLFloat16(54.0f), MLFloat16(55.0f),
+
+ // C = 14
+ MLFloat16(56.0f), MLFloat16(57.0f),
+ MLFloat16(58.0f), MLFloat16(59.0f)};
+ vector<int64_t> X_shape = {1, 15, 2, 2};
+ vector<MLFloat16> W = {
+ // M = 0
+ MLFloat16(0.0f), MLFloat16(1.0f),
+ MLFloat16(2.0f), MLFloat16(3.0f),
+
+ // M = 1
+ MLFloat16(4.0f), MLFloat16(5.0f),
+ MLFloat16(6.0f), MLFloat16(7.0f),
+
+ // M = 2
+ MLFloat16(8.0f), MLFloat16(9.0f),
+ MLFloat16(10.0f), MLFloat16(11.0f),
+
+ // M = 3
+ MLFloat16(12.0f), MLFloat16(13.0f),
+ MLFloat16(14.0f), MLFloat16(15.0f),
+
+ // M = 4
+ MLFloat16(16.0f), MLFloat16(17.0f),
+ MLFloat16(18.0f), MLFloat16(19.0f),
+
+ // M = 5
+ MLFloat16(20.0f), MLFloat16(21.0f),
+ MLFloat16(22.0f), MLFloat16(23.0f),
+
+ // M = 6
+ MLFloat16(24.0f), MLFloat16(25.0f),
+ MLFloat16(26.0f), MLFloat16(27.0f),
+
+ // M = 7
+ MLFloat16(28.0f), MLFloat16(29.0f),
+ MLFloat16(30.0f), MLFloat16(31.0f),
+
+ // M = 8
+ MLFloat16(32.0f), MLFloat16(33.0f),
+ MLFloat16(34.0f), MLFloat16(35.0f),
+
+ // M = 9
+ MLFloat16(36.0f), MLFloat16(37.0f),
+ MLFloat16(38.0f), MLFloat16(39.0f),
+
+ // M = 10
+ MLFloat16(40.0f), MLFloat16(41.0f),
+ MLFloat16(42.0f), MLFloat16(43.0f),
+
+ // M = 11
+ MLFloat16(44.0f), MLFloat16(45.0f),
+ MLFloat16(46.0f), MLFloat16(47.0f),
+
+ // M = 12
+ MLFloat16(48.0f), MLFloat16(49.0f),
+ MLFloat16(50.0f), MLFloat16(51.0f),
+
+ // M = 13
+ MLFloat16(52.0f), MLFloat16(53.0f),
+ MLFloat16(54.0f), MLFloat16(55.0f),
+
+ // M = 14
+ MLFloat16(56.0f), MLFloat16(57.0f),
+ MLFloat16(58.0f), MLFloat16(59.0f)};
+ vector<int64_t> W_shape = {15, 1, 2, 2};
+ vector<MLFloat16> B = {
+ MLFloat16(101.0f),
+ MLFloat16(102.0f),
+ MLFloat16(103.0f),
+ MLFloat16(104.0f),
+ MLFloat16(105.0f),
+ MLFloat16(106.0f),
+ MLFloat16(107.0f),
+ MLFloat16(108.0f),
+ MLFloat16(109.0f),
+ MLFloat16(110.0f),
+ MLFloat16(111.0f),
+ MLFloat16(112.0f),
+ MLFloat16(113.0f),
+ MLFloat16(114.0f),
+ MLFloat16(115.0f)};
+ vector<int64_t> B_shape = {15};
+ vector<int64_t> Y_shape = {1, 15, 1, 1};
+ auto expected_vals = {
+ MLFloat16(115.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 101.0
+ MLFloat16(228.0f),
+ MLFloat16(469.0f),
+ MLFloat16(838.0f),
+ MLFloat16(1335.0f),
+ MLFloat16(1960.0f),
+ MLFloat16(2713.0f), // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 107.0
+ MLFloat16(3594.0f),
+ MLFloat16(4603.0f),
+ MLFloat16(5740.0f),
+ MLFloat16(7005.0f),
+ MLFloat16(8398.0f),
+ MLFloat16(9919.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 113.0
+ MLFloat16(11568.0f), // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 114.0
+ MLFloat16(13345.0f) // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 115.0
+ };
+
+ TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
+ TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
+}
+
TEST(ConvFp16Test, ConvDimWithZero) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
@@ -1074,4 +1309,4 @@ TEST(ConvFp16Test, SharedPrepackedWeights) {
} // namespace test
} // namespace onnxruntime
-#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED
\ No newline at end of file
+#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED
diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
index 0efa78af2795c..2d885ee9d479f 100644
--- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
@@ -647,6 +647,241 @@ TEST(ConvTest, Conv2D_group) {
TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true);
}
+TEST(ConvTest, Depthwise2D_Bias_Group1_Issue18992) {
+ ConvOpAndTestAttributes attrs = {
+ "", // auto_pad
+ vector<int64_t>{1, 1}, // dilations
+ 1, // group
+ vector<int64_t>{1, 1}, // kernel_shape
+ vector<int64_t>{0, 0, 0, 0}, // pads
+ vector<int64_t>{1, 1}, // strides
+ {} // excluded EPs
+ };
+
+ vector<float> X = {1.0f};
+ vector<int64_t> X_shape = {1, 1, 1, 1};
+ vector<float> W = {0.5f};
+ vector<int64_t> W_shape = {1, 1, 1, 1};
+ vector<float> B = {0.5f};
+ vector<int64_t> B_shape = {1};
+ vector<int64_t> Y_shape = {1, 1, 1, 1};
+ auto expected_vals = {1.0f};
+
+ TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
+ TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
+}
+
+TEST(ConvTest, Depthwise2D_Bias_Group2) {
+ ConvOpAndTestAttributes attrs = {
+ "", // auto_pad
+ vector<int64_t>{1, 1}, // dilations
+ 2, // group
+ vector<int64_t>{1, 1}, // kernel_shape
+ vector<int64_t>{0, 0, 0, 0}, // pads
+ vector<int64_t>{1, 1}, // strides
+ {} // excluded EPs
+ };
+
+ vector<float> X = {
+ 0.0f, 1.0f, 2.0f,
+ 3.0f, 4.0f, 5.0f,
+ 6.0f, 7.0f, 8.0f,
+
+ 9.0f, 10.0f, 11.0f,
+ 12.0f, 13.0f, 14.0f,
+ 15.0f, 16.0f, 17.0f};
+ vector<int64_t> X_shape = {1, 2, 3, 3};
+ vector<float> W = {1.0f, 2.0f};
+ vector<int64_t> W_shape = {2, 1, 1, 1};
+ vector<float> B = {1.0f, -1.0f};
+ vector<int64_t> B_shape = {2};
+ vector<int64_t> Y_shape = {1, 2, 3, 3};
+ auto expected_vals = {
+ 1.0f, 2.0f, 3.0f,
+ 4.0f, 5.0f, 6.0f,
+ 7.0f, 8.0f, 9.0f,
+
+ 17.0f, 19.0f, 21.0f,
+ 23.0f, 25.0f, 27.0f,
+ 29.0f, 31.0f, 33.0f};
+
+ TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
+ TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
+}
+
+TEST(ConvTest, Depthwise2D_Bias_Group15) {
+ ConvOpAndTestAttributes attrs = {
+ "", // auto_pad
+ vector<int64_t>{1, 1}, // dilations
+ 15, // group
+ vector<int64_t>{2, 2}, // kernel_shape
+ vector<int64_t>{0, 0, 0, 0}, // pads
+ vector<int64_t>{1, 1}, // strides
+ {} // excluded EPs
+ };
+
+ vector<float> X = {
+ // C = 0
+ 0.0f, 1.0f,
+ 2.0f, 3.0f,
+
+ // C = 1
+ 4.0f, 5.0f,
+ 6.0f, 7.0f,
+
+ // C = 2
+ 8.0f, 9.0f,
+ 10.0f, 11.0f,
+
+ // C = 3
+ 12.0f, 13.0f,
+ 14.0f, 15.0f,
+
+ // C = 4
+ 16.0f, 17.0f,
+ 18.0f, 19.0f,
+
+ // C = 5
+ 20.0f, 21.0f,
+ 22.0f, 23.0f,
+
+ // C = 6
+ 24.0f, 25.0f,
+ 26.0f, 27.0f,
+
+ // C = 7
+ 28.0f, 29.0f,
+ 30.0f, 31.0f,
+
+ // C = 8
+ 32.0f, 33.0f,
+ 34.0f, 35.0f,
+
+ // C = 9
+ 36.0f, 37.0f,
+ 38.0f, 39.0f,
+
+ // C = 10
+ 40.0f, 41.0f,
+ 42.0f, 43.0f,
+
+ // C = 11
+ 44.0f, 45.0f,
+ 46.0f, 47.0f,
+
+ // C = 12
+ 48.0f, 49.0f,
+ 50.0f, 51.0f,
+
+ // C = 13
+ 52.0f, 53.0f,
+ 54.0f, 55.0f,
+
+ // C = 14
+ 56.0f, 57.0f,
+ 58.0f, 59.0f};
+ vector<int64_t> X_shape = {1, 15, 2, 2};
+ vector<float> W = {
+ // M = 0
+ 0.0f, 1.0f,
+ 2.0f, 3.0f,
+
+ // M = 1
+ 4.0f, 5.0f,
+ 6.0f, 7.0f,
+
+ // M = 2
+ 8.0f, 9.0f,
+ 10.0f, 11.0f,
+
+ // M = 3
+ 12.0f, 13.0f,
+ 14.0f, 15.0f,
+
+ // M = 4
+ 16.0f, 17.0f,
+ 18.0f, 19.0f,
+
+ // M = 5
+ 20.0f, 21.0f,
+ 22.0f, 23.0f,
+
+ // M = 6
+ 24.0f, 25.0f,
+ 26.0f, 27.0f,
+
+ // M = 7
+ 28.0f, 29.0f,
+ 30.0f, 31.0f,
+
+ // M = 8
+ 32.0f, 33.0f,
+ 34.0f, 35.0f,
+
+ // M = 9
+ 36.0f, 37.0f,
+ 38.0f, 39.0f,
+
+ // M = 10
+ 40.0f, 41.0f,
+ 42.0f, 43.0f,
+
+ // M = 11
+ 44.0f, 45.0f,
+ 46.0f, 47.0f,
+
+ // M = 12
+ 48.0f, 49.0f,
+ 50.0f, 51.0f,
+
+ // M = 13
+ 52.0f, 53.0f,
+ 54.0f, 55.0f,
+
+ // M = 14
+ 56.0f, 57.0f,
+ 58.0f, 59.0f};
+ vector<int64_t> W_shape = {15, 1, 2, 2};
+ vector<float> B = {
+ 101.0f,
+ 102.0f,
+ 103.0f,
+ 104.0f,
+ 105.0f,
+ 106.0f,
+ 107.0f,
+ 108.0f,
+ 109.0f,
+ 110.0f,
+ 111.0f,
+ 112.0f,
+ 113.0f,
+ 114.0f,
+ 115.0f};
+ vector<int64_t> B_shape = {15};
+ vector<int64_t> Y_shape = {1, 15, 1, 1};
+ auto expected_vals = {
+ 115.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 101.0
+ 228.0f,
+ 469.0f,
+ 838.0f,
+ 1335.0f,
+ 1960.0f,
+ 2713.0f, // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 107.0
+ 3594.0f,
+ 4603.0f,
+ 5740.0f,
+ 7005.0f,
+ 8398.0f,
+ 9919.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 113.0
+ 11568.0f, // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 114.0
+ 13345.0f // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 115.0
+ };
+
+ TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
+ TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
+}
+
TEST(ConvTest, ConvDimWithZero) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad