microsoft · yuslepukhin · Aug 13, 2024 · Jul 26, 2024 · Jul 26, 2024 · Jul 26, 2024
diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc
@@ -103,6 +103,13 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {
   if (y->Shape().Size() == 0)
     return Status::OK();
 
+  if (helper.K() == 0) {
+    // When K == 0, i.e. (M, 0) x (0, N), both inputs are empty, but the (M, N)
+    // output is not empty and must be filled with zeros.
+    memset(y->MutableDataRaw(), 0, y->SizeInBytes());
+    return Status::OK();
+  }
+
   // Using DataRaw as int32_t/uint32_t and int64_t/uint64_t share a common
   // operator body.
   const auto* a_data = reinterpret_cast<const T*>(a->DataRaw());

diff --git a/onnxruntime/core/providers/cuda/math/matmul.cc b/onnxruntime/core/providers/cuda/math/matmul.cc
@@ -110,7 +110,16 @@ Status MatMul<T>::ComputeInternal(OpKernelContext* ctx) const {
 
   Tensor* Y = ctx->Output(0, helper.OutputShape());
   // Bail out early if the output is going to be empty
-  if (Y->Shape().Size() == 0) return Status::OK();
+  const auto output_size = Y->Shape().Size();
+  if (output_size == 0) return Status::OK();
+
+  if (helper.K() == 0) {
+    // When K == 0, i.e. (M, 0) x (0, N), both inputs are empty, but the (M, N)
+    // output is not empty and must be filled with zeros.
+    using CudaT = typename ToCudaType<T>::MappedType;
+    Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(Y->MutableData<T>()), CudaT(0.f), narrow<int64_t>(output_size));
+    return Status::OK();
+  }
 
   if (GetTuningContext()->IsTunableOpEnabled()) {
     return tunable::TunableMatMul<T>(alpha_, trans_a, trans_b, trans_batch_a_, trans_batch_b_, helper, this, ctx);

diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc
@@ -219,6 +219,27 @@ TEST(MathOpTest, MatMulUint64Type) {
   RunMatMulTest<uint64_t>(9);
 }
 
+TEST(MathOpTest, MatMul_ZeroK) {
+  // K == 0: A is {4, 0} and B is {0, 4}, so both inputs are empty and Y {4, 4} must be all zeros.
+  constexpr const std::array<float, 0> empty_input{};
+  const std::vector<float> expected_output{0, 0, 0, 0,
+                                           0, 0, 0, 0,
+                                           0, 0, 0, 0,
+                                           0, 0, 0, 0};
+  OpTester test("MatMul", 14);
+
+  test.AddInput<float>("A", {4, 0}, empty_input);
+  test.AddInput<float>("B", {0, 4}, empty_input);
+  test.AddOutput<float>("Y", {4, 4}, expected_output);
+
+  // The EPs excluded below have no K == 0 special case implemented, so skip them.
+  test.ConfigExcludeEps({kCoreMLExecutionProvider, kNnapiExecutionProvider,
+                         kDmlExecutionProvider, kDnnlExecutionProvider, kQnnExecutionProvider,
+                         kOpenVINOExecutionProvider})
+      .Config(run_with_tunable_op)
+      .RunWithConfig();
+}
+
 #if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(MathOpTest, MatMul_Float16) {
 #ifdef USE_CUDA