From c74409ea8ee1cfbb295d0d1d4a7e8e116c29257f Mon Sep 17 00:00:00 2001
From: Anagha Rao <anagrao@microsoft.com>
Date: Wed, 24 Jan 2024 22:30:33 -0800
Subject: [PATCH 1/7] Working FP32 tests

---
 .../matmul_integer_to_float_test.cc           | 554 ++++++++++++++++--
 1 file changed, 508 insertions(+), 46 deletions(-)
diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index d1883815c1a6f..53af9ae43eaef 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -32,40 +32,61 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
                               bool has_zp = true,
                               bool has_bias = false) {
   // create rand inputs
-  RandomValueGenerator random{};
+  RandomValueGenerator random{2502124740};
+  per_column = reference_model.length() < 0;
 
   std::vector<IType> A_data;
-  std::vector<int> tmp_A_data = random.Uniform<int32_t>(A_dims,
-                                                        std::numeric_limits<WType>::lowest(),
-                                                        std::numeric_limits<WType>::max());
-  std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> WType {
+  std::vector<IType> tmp_A_data = random.Uniform<IType>(A_dims,
+                                                        std::numeric_limits<IType>::lowest(),
+                                                        std::numeric_limits<IType>::max());
+  std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> IType {
+    //v = 1;
     return static_cast<IType>(v);
   });
 
   std::vector<WType> B_data;
-  std::vector<int> tmp_B_data = random.Uniform<int32_t>(B_dims,
+
+//#if defined(USE_DML)
+//  std::vector<int> tmp_B_data = random.Uniform<int32_t>(B_dims,
+//                                                        (constexpr(std::is_same_v<WType, int8_t>) ? -2 : 1),
+//                                                        5);
+//#else
+  std::vector<WType> tmp_B_data = random.Uniform<WType>(B_dims,
                                                         std::numeric_limits<WType>::lowest(),
                                                         std::numeric_limits<WType>::max());
+//#endif
+
   std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType {
-    return static_cast<WType>(v);
+    //v = 1;
+      return static_cast<WType>(v);
   });
 
-  std::vector<OType> A_scale = random.Uniform<OType>(AsSpan<int64_t>({1}), -0.1f, 0.1f);
+  //std::vector<OType> A_scale = random.Uniform<OType>(AsSpan<int64_t>({1}), -0.1f, 0.1f);
+  std::vector<OType> A_scale(1, static_cast<OType>(1.0f));
   std::vector<IType> A_zero_point{(std::numeric_limits<IType>::lowest() + std::numeric_limits<IType>::max() + IType(2)) / 2};
 
   int64_t b_scale_zp_size = per_column ? B_dims.back() : 1;
-  std::vector<OType> B_scale = random.Uniform<OType>(AsSpan({b_scale_zp_size}), -0.1f, 0.1f);
+  //int64_t b_scale_zp_size = B_dims.back();
+  std::vector<OType> B_scale = random.Uniform<OType>(AsSpan({b_scale_zp_size}), static_cast<OType>(-0.1f), static_cast<OType>(0.1f));
+  //std::vector<OType> B_scale (b_scale_zp_size, static_cast<OType>(1.0f));
+
+  //std::vector<WType> B_zero_point(b_scale_zp_size, 1);
 
   std::vector<WType> B_zero_point(b_scale_zp_size);
-  std::for_each(B_zero_point.begin(),
-                B_zero_point.end(),
-                [&random](WType& zp) {
-                  zp = static_cast<WType>(random.Uniform<int32_t>(std::array<int64_t, 1>{1},
+  if (has_zp) {
+    std::for_each(B_zero_point.begin(),
+                  B_zero_point.end(),
+                  [&random](WType& zp) {
+                    zp = static_cast<WType>(random.Uniform<WType>(std::array<int64_t, 1>{1},
                                                                   std::numeric_limits<WType>::lowest(),
-                                                                  std::numeric_limits<WType>::max())[0]);
-                });
+                                                                  std::numeric_limits<WType>::max() / 2)[0]);
+                  });
+  } else {
+    B_zero_point = {0};
+  }
 
-  std::vector<OType> Bias = random.Uniform<OType>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
+  //std::vector<OType> Bias = random.Uniform<OType>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
+  std::vector<OType> Bias(B_dims.back(), static_cast<OType>(0.0f));
 
   OpTester test("MatMulIntegerToFloat", 1, onnxruntime::kMSDomain);
   test.AddInput<IType>("A", A_dims, A_data);
@@ -77,7 +98,7 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
     test.AddInput<IType>("a_zero_point", {1}, A_zero_point);
     test.AddInput<WType>("b_zero_point", {b_scale_zp_size}, B_zero_point);
   } else {
-    test.AddOptionalInputEdge<WType>();
+    test.AddOptionalInputEdge<IType>();
     test.AddOptionalInputEdge<WType>();
   }
 
@@ -86,19 +107,68 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
   } else {
     test.AddOptionalInputEdge<OType>();
   }
-
-  test.AddReferenceOutputs(reference_model);
-#if defined(USE_DML)
-  if constexpr (std::is_same_v<OType, float>) {
-    test.SetOutputRelErr("Y", 2e-2f);
-  } else {
-    test.SetOutputRelErr("Y", 2.0f);
-  }
-#else
-  test.SetOutputRelErr("Y", 1e-4f);
-#endif
-
-  if constexpr (std::is_same_v<OType, float>) {
+  int64_t M = 10;
+  int64_t N = 10;
+  int64_t K = 10;
+  std::vector<float> expected_vals(M * N);
+
+  //if (constexpr(std::is_same_v<OType, float>))
+  //{
+    for (int64_t m = 0; m < M; m++) {
+      for (int64_t n = 0; n < N; n++) {
+        float sum = 0.0f;
+        for (int64_t k = 0; k < K; k++) {
+          float AIntermediate = has_zp ? (A_data[m * K + k] - A_zero_point[0]) : A_data[m * K + k];
+          float BIntermediate = has_zp ? (B_data[k * N + n] - B_zero_point[0]) : B_data[k * N + n];
+          sum += (AIntermediate * A_scale[0]) * (BIntermediate * B_scale[0]);
+        }
+        if (has_bias) {
+          // sum += Bias[m * N + n];
+          sum += Bias[n];
+        }
+        expected_vals[m * N + n] = static_cast<OType>(sum);
+      }
+    }
+    if (constexpr(std::is_same_v<OType, float>)) {
+      test.AddOutput<float>("Y", {M, N}, expected_vals);
+    } else {
+    test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(expected_vals));
+    }
+  //} else {
+  //  MLFloat16 AZP = static_cast<MLFloat16>(A_zero_point[0]);
+  //  MLFloat16 BZP = static_cast<MLFloat16>(B_zero_point[0]);
+  //  for (int64_t m = 0; m < M; m++) {
+  //    for (int64_t n = 0; n < N; n++) {
+  //      MLFloat16 sum = static_cast<MLFloat16>(0.0f);
+  //      for (int64_t k = 0; k < K; k++) {
+  //        MLFloat16 AIntermediate = (has_zp ? (A_data[m * K + k] - AZP) : A_data[m * K + k]);
+  //        MLFloat16 BIntermediate = (has_zp ? (B_data[k * N + n] - BZP) : B_data[k * N + n]);
+  //        sum += (AIntermediate * A_scale[0]) * (BIntermediate * B_scale[0]);
+  //      }
+  //      if (has_bias) {
+  //        // sum += Bias[m * N + n];
+  //        sum += static_cast<MLFloat16>(Bias[n]);
+  //      }
+  //      expected_vals[m * N + n] = static_cast<OType>(sum);
+  //    }
+  //  }
+  //  test.AddOutput<MLFloat16>("Y", {M, N}, expected_vals);
+  //}
+
+  //test.AddReferenceOutputs(reference_model);
+//#if defined(USE_DML)
+//  if constexpr (std::is_same_v<OType, float>) {
+//    test.SetOutputRelErr("Y", 2e-2f);
+//  } else {
+//    //test.SetOutputRelErr("Y", 1.0f);
+//    test.SetOutputAbsErr("Y", 0.5f);
+//    //test.SetOutputRelErr("Y", 2e-2f);
+//  }
+//#else
+//  test.SetOutputRelErr("Y", 1e-4f);
+//#endif
+
+  if (constexpr(std::is_same_v<OType, float>) && constexpr(std::is_same_v<IType, uint8_t>) && constexpr(std::is_same_v<WType, uint8_t>)) {
     test.Run();
   } else {
     test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider});
@@ -107,9 +177,9 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
 
 template <typename IType, typename WType, typename OType, bool HasZeroPoint, bool HasBias>
 void RunMatMulIntegerToFloatTest(const string& model_path) {
-  std::vector<int64_t> A_dims{4, 128};
-  std::vector<int64_t> B_dims{128, 128};
-  std::vector<int64_t> Y_dims{4, 128};
+  std::vector<int64_t> A_dims{10, 10};
+  std::vector<int64_t> B_dims{10, 10};
+  std::vector<int64_t> Y_dims{10, 10};
 
   TestMatMulIntegerToFloat<IType, WType, OType>(
       A_dims,
@@ -153,43 +223,198 @@ void RunMatMulIntegerToFloatTest(const string& model_path) {
 }
 
 #if USE_DML
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8.onnx");
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_uint8.onnx");
+//TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) {
+//  RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8.onnx");
+//  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_uint8.onnx");
+//}
+//
+//TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8X8_FP16) {
+//  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, false, false>("testdata/matmul_integer_to_float16_uint8.onnx");
+//}
+//
+//TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8_FP16) {
+//  RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_bias.onnx");
+//  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
+//}
+//
+//TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8_FP16) {
+//  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, true, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
+//}
+//
+//TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) {
+//  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
+//}
+//
+//TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) {
+//  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
+//}
+//
+//TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8S8_FP16) {
+//  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, true, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
+//}
+//
+//TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8S8_FP16) {
+//  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, false, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
+//}
+#endif  // USE_DML
+
+#if USE_DML
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8S8_FP16) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t , MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8.onnx");
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8S8_FP16) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t , MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_bias.onnx");
 }
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8_FP16) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_bias.onnx");
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8S8_FP16) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t , MLFloat16, false, false>("testdata/matmul_integer_to_float16_uint8.onnx");
+}
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8_FP16) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t , MLFloat16, true, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
 }
 
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
+  RunMatMulIntegerToFloatTest<int8_t, int8_t , MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
+}
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8_FP16) {
+  RunMatMulIntegerToFloatTest<int8_t, uint8_t , MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
 }
 
 TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
+  RunMatMulIntegerToFloatTest<int8_t, int8_t , MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
 }
-#endif  // USE_DML
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) {
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8U8_FP16) {
+  RunMatMulIntegerToFloatTest<int8_t, uint8_t , MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8S8_FP16) {
+  RunMatMulIntegerToFloatTest<int8_t, int8_t , MLFloat16, false, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8U8_FP16) {
+  RunMatMulIntegerToFloatTest<int8_t, uint8_t , MLFloat16, false, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
+}
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8S8_FP16) {
+  RunMatMulIntegerToFloatTest<int8_t, int8_t , MLFloat16, true, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
+}
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8U8_FP16) {
+  RunMatMulIntegerToFloatTest<int8_t, int8_t , MLFloat16, true, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
+}
+
+#endif
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8U8_FP16) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t , MLFloat16, true, false>("testdata/matmul_integer_to_float16_uint8.onnx");
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8U8_FP16) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t , MLFloat16, false, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8U8_FP16) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t , MLFloat16, false, false>("testdata/matmul_integer_to_float16_uint8.onnx");
+}
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8_FP16) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t , MLFloat16, true, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#if USE_DML
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8S8) {
   RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, false>("testdata/matmul_integer_to_float_int8.onnx");
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, true, false>("testdata/matmul_integer_to_float_uint8.onnx");
 }
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8) {
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8S8) {
   RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, false, true>("testdata/matmul_integer_to_float_int8_bias.onnx");
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, false, true>("testdata/matmul_integer_to_float_uint8_bias.onnx");
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8S8) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, false, false>("testdata/matmul_integer_to_float_uint8.onnx");
+}
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, true>("testdata/matmul_integer_to_float_uint8_bias.onnx");
 }
 
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8) {
   RunMatMulIntegerToFloatTest<int8_t, int8_t, float, true, false>("testdata/matmul_integer_to_float_int8_int8.onnx");
 }
 
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) {
+  RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, true, false>("testdata/matmul_integer_to_float_int8_int8.onnx");
+}
+
 TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8) {
   RunMatMulIntegerToFloatTest<int8_t, int8_t, float, false, true>("testdata/matmul_integer_to_float_int8_int8_bias.onnx");
 }
 
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8U8) {
+  RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, false, true>("testdata/matmul_integer_to_float_int8_int8_bias.onnx");
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8S8) {
+  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, false, false>("testdata/matmul_integer_to_float_int8_int8.onnx");
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8U8) {
+  RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, false, false>("testdata/matmul_integer_to_float_int8_int8.onnx");
+}
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8S8) {
+  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, true, true>("testdata/matmul_integer_to_float_int8_int8_bias.onnx");
+}
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8U8) {
+  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, true, true>("testdata/matmul_integer_to_float_int8_int8_bias.onnx");
+}
+
+#endif
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8U8) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, true, false>("testdata/matmul_integer_to_float_uint8.onnx");
+}
+
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8U8) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, false, true>("testdata/matmul_integer_to_float_uint8_bias.onnx");
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8U8) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, false, false>("testdata/matmul_integer_to_float_uint8.onnx");
+}
+
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, true, true>("testdata/matmul_integer_to_float_uint8_bias.onnx");
+}
+
 TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) {
   auto test_case = [&](const std::vector<int64_t>& input_shape,
                        const std::vector<int64_t>& weights_shape,
@@ -253,5 +478,242 @@ TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) {
   test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
 }
 
+TEST(MatMulIntegerToFloat, CustomMatMul) {
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 2;
+  int64_t N = 2;
+  int64_t K = 2;
+
+  std::vector<uint8_t> AMatrix = {1, 1,
+                              1, 1};
+  std::vector<uint8_t> BMatrix = {1, 1,
+                              1, 1};
+  test.AddInput<uint8_t>("A", {M,K}, AMatrix);
+  test.AddInput<uint8_t>("B", {N,K}, BMatrix);
+
+  test.AddInput<float>("a_scale", {}, {1.0f});
+  test.AddInput<float>("b_scale", {}, {1.0f});
+  //test.AddInput<uint8_t>("a_zero_point", {}, {113});
+
+  std::vector<float> expected_vals(M * N);
+  for (int64_t m = 0; m < M; m++) {
+    for (int64_t n = 0; n < N; n++) {
+      float sum = 0.0f;
+      for (int64_t k = 0; k < K; k++) {
+        sum += AMatrix[m * K + k] * BMatrix[k * N + n];
+      }
+      expected_vals[m * N + n] = sum;
+    }
+  }
+
+  test.AddOutput<float>("Y", {M , N}, expected_vals);
+
+  test.Run();
+}
+
+TEST(MatMulIntegerToFloat, CustomZPMatMul) {
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 2;
+  int64_t N = 2;
+  int64_t K = 2;
+
+  std::vector<uint8_t> AMatrix = {1, 1,
+                                  1, 1};
+  std::vector<int8_t> BMatrix = {1, -1,
+                                  1, 1};
+  float AScale = 1.0f;
+  float BScale = 1.0f;
+
+  uint8_t AZP = 113;
+  int8_t BZP = -16;
+
+  test.AddInput<uint8_t>("A", {M, K}, AMatrix);
+  test.AddInput<int8_t>("B", {N, K}, BMatrix);
+
+  test.AddInput<float>("a_scale", {}, {AScale});
+  test.AddInput<float>("b_scale", {}, {BScale});
+  test.AddInput<uint8_t>("a_zero_point", {}, {AZP});
+  test.AddInput<int8_t>("b_zero_point", {}, {BZP});
+
+  std::vector<float> expected_vals(M * N);
+  for (int64_t m = 0; m < M; m++) {
+    for (int64_t n = 0; n < N; n++) {
+      float sum = 0.0f;
+      for (int64_t k = 0; k < K; k++) {
+        sum += ((AMatrix[m * K + k] - AZP) * AScale) * ((BMatrix[k * N + n] - BZP) * BScale);
+      }
+      expected_vals[m * N + n] = sum;
+    }
+  }
+
+  test.AddOutput<float>("Y", {M, N}, expected_vals);
+
+  test.Run();
+}
+
+TEST(MatMulIntegerToFloat, CustomScaleMatMul) {
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 2;
+  int64_t N = 2;
+  int64_t K = 2;
+
+  std::vector<uint8_t> AMatrix = {1, 1,
+                                  1, 1};
+  std::vector<uint8_t> BMatrix = {1, 1,
+                                  1, 1};
+  float AScale = 0.910f;
+  float BScale = 1.10f;
+
+  uint8_t AZP = 1;
+  uint8_t BZP= 1;
+
+  test.AddInput<uint8_t>("A", {M, K}, AMatrix);
+  test.AddInput<uint8_t>("B", {N, K}, BMatrix);
+
+  test.AddInput<float>("a_scale", {}, {AScale});
+  test.AddInput<float>("b_scale", {}, {BScale});
+  test.AddInput<uint8_t>("a_zero_point", {}, {AZP});
+  test.AddInput<uint8_t>("b_zero_point", {}, {BZP});
+
+  std::vector<float> expected_vals(M * N);
+  for (int64_t m = 0; m < M; m++) {
+    for (int64_t n = 0; n < N; n++) {
+      float sum = 0.0f;
+      for (int64_t k = 0; k < K; k++) {
+        sum += ((AMatrix[m * K + k] - AZP) * AScale) * ((BMatrix[k * N + n] - BZP) * BScale);
+      }
+      expected_vals[m * N + n] = sum;
+    }
+  }
+
+  test.AddOutput<float>("Y", {M, N}, expected_vals);
+
+  test.Run();
+}
+
+TEST(MatMulIntegerToFloat, CustomMatMul1) {
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 2;
+  int64_t N = 2;
+  int64_t K = 2;
+
+  std::vector<int8_t> AMatrix = {11, -2,
+                                  -1, 3};
+  std::vector<int8_t> BMatrix = {-13, -2,
+                                 -1, 23};
+  float AScale = 0.910f;
+  float BScale = 1.10f;
+
+  int8_t AZP = 113;
+  int8_t BZP = 98;
+
+  test.AddInput<int8_t>("A", {M, K}, AMatrix);
+  test.AddInput<int8_t>("B", {N, K}, BMatrix);
+
+  test.AddInput<float>("a_scale", {}, {AScale});
+  test.AddInput<float>("b_scale", {}, {BScale});
+  test.AddInput<int8_t>("a_zero_point", {}, {AZP});
+  test.AddInput<int8_t>("b_zero_point", {}, {BZP});
+
+  std::vector<float> expected_vals(M * N);
+  for (int64_t m = 0; m < M; m++) {
+    for (int64_t n = 0; n < N; n++) {
+      float sum = 0.0f;
+      for (int64_t k = 0; k < K; k++) {
+        sum += ((AMatrix[m * K + k] - AZP) * AScale) * ((BMatrix[k * N + n] - BZP) * BScale);
+      }
+      expected_vals[m * N + n] = sum;
+    }
+  }
+
+  test.AddOutput<float>("Y", {M, N}, expected_vals);
+
+  test.Run();
+}
+
+TEST(MatMulIntegerToFloat, CustomMatMul2) {
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 2;
+  int64_t N = 2;
+  int64_t K = 2;
+
+  std::vector<int8_t> AMatrix = {11, -2,
+                                 -1, 3};
+  std::vector<int8_t> BMatrix = {-13, -2,
+                                 -1, 23};
+  float AScale = 0.910f;
+  std::vector<float> BScale = {1.10f, 1.123f};
+
+  int8_t AZP = 113;
+  std::vector<int8_t> BZP = {98, 71};
+
+  test.AddInput<int8_t>("A", {M, K}, AMatrix);
+  test.AddInput<int8_t>("B", {K, N}, BMatrix);
+
+  test.AddInput<float>("a_scale", {}, {AScale});
+  test.AddInput<float>("b_scale", {N}, BScale);
+  test.AddInput<int8_t>("a_zero_point", {}, {AZP});
+  test.AddInput<int8_t>("b_zero_point", {N}, BZP);
+
+  std::vector<float> expected_vals(M * N);
+  for (int64_t m = 0; m < M; m++) {
+    for (int64_t n = 0; n < N; n++) {
+      float sum = 0.0f;
+      for (int64_t k = 0; k < K; k++) {
+        sum += ((AMatrix[m * K + k] - AZP) * AScale) * ((BMatrix[k * N + n] - BZP[n]) * BScale[n]);
+      }
+      expected_vals[m * N + n] = sum;
+    }
+  }
+
+  test.AddOutput<float>("Y", {M, N}, expected_vals);
+
+  test.Run();
+}
+
+TEST(MatMulIntegerToFloat, CustomBiasMatMul) {
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 2;
+  int64_t N = 2;
+  int64_t K = 3;
+
+  std::vector<int8_t> AMatrix = {11, -2, 5,
+                                 -1, 3, 10};
+  std::vector<int8_t> BMatrix = {-13, -2,
+                                 9, 55,
+                                 -1, 23};
+  float AScale = 0.910f;
+  std::vector<float> BScale = {1.10f, 1.123f};
+
+  int8_t AZP = 113;
+  std::vector<int8_t> BZP = {98, 71};
+
+  std::vector<float> Bias = {0.10f, 1.123f};
+
+  test.AddInput<int8_t>("A", {M, K}, AMatrix);
+  test.AddInput<int8_t>("B", {K, N}, BMatrix);
+
+  test.AddInput<float>("a_scale", {}, {AScale});
+  test.AddInput<float>("b_scale", {N}, BScale);
+  test.AddInput<int8_t>("a_zero_point", {}, {AZP});
+  test.AddInput<int8_t>("b_zero_point", {N}, BZP);
+  test.AddInput<float>("bias", {N}, Bias);
+
+  std::vector<float> expected_vals(M * N);
+  for (int64_t m = 0; m < M; m++) {
+    for (int64_t n = 0; n < N; n++) {
+      float sum = 0.0f;
+      for (int64_t k = 0; k < K; k++) {
+        sum += ((AMatrix[m * K + k] - AZP) * AScale) * ((BMatrix[k * N + n] - BZP[n]) * BScale[n]);
+      }
+      expected_vals[m * N + n] = sum + Bias[n];
+    }
+  }
+
+  test.AddOutput<float>("Y", {M, N}, expected_vals);
+
+  test.Run();
+}
+
 }  // namespace test
 }  // namespace onnxruntime

From 23cce714a38d91cb7f48e4958da3d8d9b5e5e0ba Mon Sep 17 00:00:00 2001
From: Anagha Rao <anagrao@microsoft.com>
Date: Mon, 29 Jan 2024 22:40:33 -0800
Subject: [PATCH 2/7] All tests working

---
 .../src/External/DirectMLHelpers/ApiTraits.h  |   2 +-
 .../dml/OperatorAuthorHelper/OperatorHelper.h |  10 +-
 .../matmul_integer_to_float_test.cc           | 713 ++++++------------
 .../test/testdata/matmul_integer_to_float.py  |  30 +-
 .../matmul_integer_to_float16_int8.onnx       |  51 --
 .../matmul_integer_to_float16_int8_bias.onnx  |  49 --
 .../matmul_integer_to_float16_int8_int8.onnx  |  51 --
 ...mul_integer_to_float16_int8_int8_bias.onnx |  49 --
 .../matmul_integer_to_float16_int8.onnx       |  51 --
 9 files changed, 240 insertions(+), 766 deletions(-)
 delete mode 100644 onnxruntime/test/testdata/matmul_integer_to_float16_int8.onnx
 delete mode 100644 onnxruntime/test/testdata/matmul_integer_to_float16_int8_bias.onnx
 delete mode 100644 onnxruntime/test/testdata/matmul_integer_to_float16_int8_int8.onnx
 delete mode 100644 onnxruntime/test/testdata/matmul_integer_to_float16_int8_int8_bias.onnx
 delete mode 100644 onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h
index a7a6e59e400ef..3c0f49f3d2d49 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h
@@ -882,7 +882,7 @@ struct OperatorDescTraits<DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_DESC>
 template <>
 struct OperatorDescTraits<DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC>
 {
-    static constexpr DML_OPERATOR_TYPE Type = (DML_OPERATOR_TYPE) DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT;
+    static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT;
 };
 
 template <>
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
index 7146edd861257..d5a66a74237ee 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
@@ -826,14 +826,6 @@ class QLinearMatMulHelper : public MatMulHelperBase
     QLinearMatMulHelper(const Info_t& info, const Shape_t& shape) : MatMulHelperBase(info, shape, 0, 3) {}
 };
 
-class MatMulIntegerToFloatHelper : public MatMulHelperBase
-{
-public:
-    template<typename Info_t, typename Shape_t>
-    MatMulIntegerToFloatHelper(const Info_t& info, const Shape_t& shape) : MatMulHelperBase(info, shape, 0, 1) {}
-};
-
-
 class TopKHelper
 {
     void Initialize(
@@ -1752,7 +1744,7 @@ using ShapeInferenceHelper_Identity16 = GetOutputShapeAsInputShapeHelper;
 using ShapeInferenceHelper_MatMul = MatMulHelper;
 using ShapeInferenceHelper_MatMulInteger = MatMulHelper;
 using ShapeInferenceHelper_DynamicQuantizeMatMul = MatMulHelper;
-using ShapeInferenceHelper_MatMulIntegerToFloat = MatMulIntegerToFloatHelper;
+using ShapeInferenceHelper_MatMulIntegerToFloat = MatMulHelper;
 using ShapeInferenceHelper_QLinearMatMul = QLinearMatMulHelper;
 using ShapeInferenceHelper_QLinearAdd = GetBroadcastedOutputShapeHelper;
 using ShapeInferenceHelper_DynamicQuantizeLinear = GetOutputShapeAsInputShapeHelper;
diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index 53af9ae43eaef..49560b8ff268a 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -24,69 +24,78 @@ namespace onnxruntime {
 namespace test {
 
 template <typename IType, typename WType, typename OType>
-void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
-                              std::vector<int64_t> B_dims,
-                              const std::string& reference_model,
-                              bool is_matrix_b_constant,
+static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, const int64_t K, const std::vector<IType>& A_data, const std::vector<OType>& A_scale, const std::vector<IType>& A_zero_point, const std::vector<WType>& B_data, std::vector<OType>& B_scale, std::vector<WType>& B_zero_point, const 
+  std::vector<OType>& Bias, std::vector<float>& Y_data, bool per_column, bool has_zp, bool has_bias) {
+ 
+  if (!per_column) {
+    B_zero_point.resize(N, B_zero_point[0]);
+    B_scale.resize(N, B_scale[0]);
+  }
+
+  for (int64_t m = 0; m < M; m++) {
+    for (int64_t n = 0; n < N; n++) {
+      float sum = 0.0f;
+      for (int64_t k = 0; k < K; k++) {
+        float A_dequantized = has_zp ? (A_data[m * K + k] - A_zero_point[0]) * A_scale[0] : A_data[m * K + k] * A_scale[0];
+        float B_dequantized = has_zp ? (B_data[k * N + n] - B_zero_point[n]) * B_scale[n] : B_data[k * N + n] * B_scale[n];
+
+        sum += A_dequantized * B_dequantized;
+      }
+      if (has_bias) {
+        sum += Bias[n];
+      }
+      Y_data[m * N + n] = static_cast<OType>(sum);
+    }
+  }
+}
+
+template <typename IType, typename WType, typename OType>
+void TestMatMulIntegerToFloat(bool is_matrix_b_constant,
                               bool per_column = false,
                               bool has_zp = true,
                               bool has_bias = false) {
   // create rand inputs
-  RandomValueGenerator random{2502124740};
-  per_column = reference_model.length() < 0;
-
+  RandomValueGenerator random{};
+  int64_t M = 4;
+  int64_t N = 128;
+  int64_t K = 128;
+  std::vector<int64_t> A_dims{M, K};
+  std::vector<int64_t> B_dims{K, N};
+  std::vector<int64_t> Y_dims{M, K};
   std::vector<IType> A_data;
   std::vector<IType> tmp_A_data = random.Uniform<IType>(A_dims,
                                                         std::numeric_limits<IType>::lowest(),
                                                         std::numeric_limits<IType>::max());
   std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> IType {
-    //v = 1;
     return static_cast<IType>(v);
   });
 
   std::vector<WType> B_data;
 
-//#if defined(USE_DML)
-//  std::vector<int> tmp_B_data = random.Uniform<int32_t>(B_dims,
-//                                                        (constexpr(std::is_same_v<WType, int8_t>) ? -2 : 1),
-//                                                        5);
-//#else
   std::vector<WType> tmp_B_data = random.Uniform<WType>(B_dims,
                                                         std::numeric_limits<WType>::lowest(),
                                                         std::numeric_limits<WType>::max());
-//#endif
 
   std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType {
-    //v = 1;
       return static_cast<WType>(v);
   });
 
-  //std::vector<OType> A_scale = random.Uniform<OType>(AsSpan<int64_t>({1}), -0.1f, 0.1f);
-  std::vector<OType> A_scale(1, static_cast<OType>(1.0f));
+  std::vector<OType> A_scale = random.Uniform<OType>(AsSpan<int64_t>({1}), -0.1f, 0.1f);
   std::vector<IType> A_zero_point{(std::numeric_limits<IType>::lowest() + std::numeric_limits<IType>::max() + IType(2)) / 2};
 
   int64_t b_scale_zp_size = per_column ? B_dims.back() : 1;
-  //int64_t b_scale_zp_size = B_dims.back();
-  std::vector<OType> B_scale = random.Uniform<OType>(AsSpan({b_scale_zp_size}), static_cast<OType>(-0.1f), static_cast<OType>(0.1f));
-  //std::vector<OType> B_scale (b_scale_zp_size, static_cast<OType>(1.0f));
-
-  //std::vector<WType> B_zero_point(b_scale_zp_size, 1);
+  std::vector<OType> B_scale = random.Uniform<OType>(AsSpan({b_scale_zp_size}), -0.1f, 0.1f);
 
   std::vector<WType> B_zero_point(b_scale_zp_size);
-  if (has_zp) {
-    std::for_each(B_zero_point.begin(),
-                  B_zero_point.end(),
-                  [&random](WType& zp) {
-                    zp = static_cast<WType>(random.Uniform<WType>(std::array<int64_t, 1>{1},
-                                                                  std::numeric_limits<WType>::lowest(),
-                                                                  std::numeric_limits<WType>::max() / 2)[0]);
-                  });
-  } else {
-    B_zero_point = {0};
-  }
+  std::for_each(B_zero_point.begin(),
+                B_zero_point.end(),
+                [&random](WType& zp) {
+                  zp = static_cast<WType>(random.Uniform<WType>(std::array<int64_t, 1>{1},
+                                                                std::numeric_limits<WType>::lowest(),
+                                                                std::numeric_limits<WType>::max())[0]);
+                });
 
-  //std::vector<OType> Bias = random.Uniform<OType>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
-  std::vector<OType> Bias(B_dims.back(), static_cast<OType>(0.0f));
+  std::vector<OType> Bias = random.Uniform<OType>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
 
   OpTester test("MatMulIntegerToFloat", 1, onnxruntime::kMSDomain);
   test.AddInput<IType>("A", A_dims, A_data);
@@ -107,84 +116,33 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
   } else {
     test.AddOptionalInputEdge<OType>();
   }
-  int64_t M = 10;
-  int64_t N = 10;
-  int64_t K = 10;
-  std::vector<float> expected_vals(M * N);
-
-  //if (constexpr(std::is_same_v<OType, float>))
-  //{
-    for (int64_t m = 0; m < M; m++) {
-      for (int64_t n = 0; n < N; n++) {
-        float sum = 0.0f;
-        for (int64_t k = 0; k < K; k++) {
-          float AIntermediate = has_zp ? (A_data[m * K + k] - A_zero_point[0]) : A_data[m * K + k];
-          float BIntermediate = has_zp ? (B_data[k * N + n] - B_zero_point[0]) : B_data[k * N + n];
-          sum += (AIntermediate * A_scale[0]) * (BIntermediate * B_scale[0]);
-        }
-        if (has_bias) {
-          // sum += Bias[m * N + n];
-          sum += Bias[n];
-        }
-        expected_vals[m * N + n] = static_cast<OType>(sum);
-      }
-    }
-    if (constexpr(std::is_same_v<OType, float>)) {
-      test.AddOutput<float>("Y", {M, N}, expected_vals);
+
+  std::vector<float> Y_data(M * N);
+  CalculateMatMulIntegerToFloat<IType, WType, OType>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, Bias, Y_data, per_column, has_zp, has_bias);
+
+    if ( constexpr(std::is_same_v<OType, float>)) {
+      test.AddOutput<float>("Y", {M, N}, Y_data);
     } else {
-    test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(expected_vals));
+      test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
+      test.SetOutputAbsErr("Y", 0.5f);
     }
-  //} else {
-  //  MLFloat16 AZP = static_cast<MLFloat16>(A_zero_point[0]);
-  //  MLFloat16 BZP = static_cast<MLFloat16>(B_zero_point[0]);
-  //  for (int64_t m = 0; m < M; m++) {
-  //    for (int64_t n = 0; n < N; n++) {
-  //      MLFloat16 sum = static_cast<MLFloat16>(0.0f);
-  //      for (int64_t k = 0; k < K; k++) {
-  //        MLFloat16 AIntermediate = (has_zp ? (A_data[m * K + k] - AZP) : A_data[m * K + k]);
-  //        MLFloat16 BIntermediate = (has_zp ? (B_data[k * N + n] - BZP) : B_data[k * N + n]);
-  //        sum += (AIntermediate * A_scale[0]) * (BIntermediate * B_scale[0]);
-  //      }
-  //      if (has_bias) {
-  //        // sum += Bias[m * N + n];
-  //        sum += static_cast<MLFloat16>(Bias[n]);
-  //      }
-  //      expected_vals[m * N + n] = static_cast<OType>(sum);
-  //    }
-  //  }
-  //  test.AddOutput<MLFloat16>("Y", {M, N}, expected_vals);
-  //}
-
-  //test.AddReferenceOutputs(reference_model);
-//#if defined(USE_DML)
-//  if constexpr (std::is_same_v<OType, float>) {
-//    test.SetOutputRelErr("Y", 2e-2f);
-//  } else {
-//    //test.SetOutputRelErr("Y", 1.0f);
-//    test.SetOutputAbsErr("Y", 0.5f);
-//    //test.SetOutputRelErr("Y", 2e-2f);
-//  }
-//#else
-//  test.SetOutputRelErr("Y", 1e-4f);
-//#endif
-
-  if (constexpr(std::is_same_v<OType, float>) && constexpr(std::is_same_v<IType, uint8_t>) && constexpr(std::is_same_v<WType, uint8_t>)) {
+
+  // Only DML EP supports these data type combinations for now
+  if ((constexpr(std::is_same_v<OType, MLFloat16>)) ||
+      (constexpr(std::is_same_v<OType, float>) &&
+         /*(constexpr(std::is_same_v<IType, uint8_t>) &&*/  !constexpr(std::is_same_v<WType, IType>))
+          ) {
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultDmlExecutionProvider());
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+    } else {
     test.Run();
-  } else {
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider});
   }
 }
 
 template <typename IType, typename WType, typename OType, bool HasZeroPoint, bool HasBias>
-void RunMatMulIntegerToFloatTest(const string& model_path) {
-  std::vector<int64_t> A_dims{10, 10};
-  std::vector<int64_t> B_dims{10, 10};
-  std::vector<int64_t> Y_dims{10, 10};
-
+void RunMatMulIntegerToFloatTest() {
   TestMatMulIntegerToFloat<IType, WType, OType>(
-      A_dims,
-      B_dims,
-      model_path,
       false,        /*is_matrix_b_constant*/
       false,        /*per_column*/
       HasZeroPoint, /*has_zp*/
@@ -192,9 +150,6 @@ void RunMatMulIntegerToFloatTest(const string& model_path) {
   );
 
   TestMatMulIntegerToFloat<IType, WType, OType>(
-      A_dims,
-      B_dims,
-      model_path,
       true,         /*is_matrix_b_constant*/
       false,        /*per_column*/
       HasZeroPoint, /*has_zp*/
@@ -202,9 +157,6 @@ void RunMatMulIntegerToFloatTest(const string& model_path) {
   );
 
   TestMatMulIntegerToFloat<IType, WType, OType>(
-      A_dims,
-      B_dims,
-      model_path,
       false,        /*is_matrix_b_constant*/
       true,         /*per_column*/
       HasZeroPoint, /*has_zp*/
@@ -212,9 +164,6 @@ void RunMatMulIntegerToFloatTest(const string& model_path) {
   );
 
   TestMatMulIntegerToFloat<IType, WType, OType>(
-      A_dims,
-      B_dims,
-      model_path,
       true,         /*is_matrix_b_constant*/
       true,         /*per_column*/
       HasZeroPoint, /*has_zp*/
@@ -222,198 +171,247 @@ void RunMatMulIntegerToFloatTest(const string& model_path) {
   );
 }
 
-#if USE_DML
-//TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) {
-//  RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8.onnx");
-//  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_uint8.onnx");
-//}
-//
-//TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8X8_FP16) {
-//  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, false, false>("testdata/matmul_integer_to_float16_uint8.onnx");
-//}
-//
-//TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8_FP16) {
-//  RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_bias.onnx");
-//  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
-//}
-//
-//TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8_FP16) {
-//  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, true, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
-//}
-//
-//TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) {
-//  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
-//}
-//
-//TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) {
-//  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
-//}
-//
-//TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8S8_FP16) {
-//  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, true, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
-//}
-//
-//TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8S8_FP16) {
-//  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, false, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
-//}
-#endif  // USE_DML
-
-#if USE_DML
-
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8S8_FP16) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t , MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8.onnx");
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8) {
+  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, true, false>();
 }
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8S8_FP16) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t , MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_bias.onnx");
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8) {
+  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, false, true>();
 }
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8S8_FP16) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t , MLFloat16, false, false>("testdata/matmul_integer_to_float16_uint8.onnx");
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8S8) {
+  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, false, false>();
 }
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8_FP16) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t , MLFloat16, true, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8S8) {
+  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, true, true>();
 }
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t , MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8U8) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, true, false>();
 }
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8_FP16) {
-  RunMatMulIntegerToFloatTest<int8_t, uint8_t , MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
-}
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t , MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8U8) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, false, true>();
 }
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8U8_FP16) {
-  RunMatMulIntegerToFloatTest<int8_t, uint8_t , MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8U8) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, false, false>();
 }
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8S8_FP16) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t , MLFloat16, false, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
-}
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8U8_FP16) {
-  RunMatMulIntegerToFloatTest<int8_t, uint8_t , MLFloat16, false, false>("testdata/matmul_integer_to_float16_int8_int8.onnx");
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8) {
+  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, true, true>();
 }
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8S8_FP16) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t , MLFloat16, true, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
-}
+// DML EP supports Float16 output type and A Matrix and B Matrix of different data types for Float32 output
+#if defined(USE_DML)
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8U8_FP16) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t , MLFloat16, true, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) {
+  RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, true, false>();
 }
 
-#endif
-
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8U8_FP16) {
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t , MLFloat16, true, false>("testdata/matmul_integer_to_float16_uint8.onnx");
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8U8) {
+  RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, false, true>();
 }
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8U8_FP16) {
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t , MLFloat16, false, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8U8) {
+  RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, false, false>();
 }
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8U8_FP16) {
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t , MLFloat16, false, false>("testdata/matmul_integer_to_float16_uint8.onnx");
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8U8) {  // signed A (int8_t) x unsigned B (uint8_t), float output, with zero point and bias
+  RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, true, true>();
 }
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8_FP16) {
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t , MLFloat16, true, true>("testdata/matmul_integer_to_float16_uint8_bias.onnx");
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8S8) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, false>();
 }
 
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8S8) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, false, true>();
+}
 
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8S8) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, false, false>();
+}
 
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, true>();
+}
 
+TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8U8) {  // uint8 A x uint8 B with MLFloat16 scales and output; run only on the DML EP
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 5;
+  int64_t N = 5;
+  int64_t K = 2;
 
+  std::vector<uint8_t> A_data = {1, 5, 2, 1, 9,  // 10 values; registered below with dims {M, K} = {5, 2}, not the 2x5 layout shown here
+                              1, 1, 3, 7, 2};
+  std::vector<uint8_t> B_data = {3, 7, 2, 1, 1,  // registered below with dims {K, N} = {2, 5}
+                                 2, 1, 9, 1, 1};
+  std::vector<MLFloat16> A_scale = ToFloat16({3.0f});
+  std::vector<MLFloat16> B_scale = ToFloat16({2.0f});
+  test.AddInput<uint8_t>("A", {M, K}, A_data);
+  test.AddInput<uint8_t>("B", {K, N}, B_data);
+  std::vector<uint8_t> A_zero_point = {3};
+  std::vector<uint8_t> B_zero_point = {5};
 
 
+  test.AddInput<MLFloat16>("a_scale", {1}, A_scale);
+  test.AddInput<MLFloat16>("b_scale", {1}, B_scale);
+  test.AddInput<uint8_t>("a_zero_point", {1}, A_zero_point);
+  test.AddInput<uint8_t>("b_zero_point", {1}, B_zero_point);
 
+  std::vector<float> Y_data(M * N);  // reference output, computed in float and converted to MLFloat16 below
+  CalculateMatMulIntegerToFloat<uint8_t, uint8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, {}, Y_data, false, true, false);  // trailing flags: per_column=false, has_zp=true, has_bias=false; {} is the (absent) bias
 
+  test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;  // explicit provider list so only the DML EP is exercised
+  execution_providers.push_back(DefaultDmlExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
 
+TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8S8) {  // uint8 A x int8 B with MLFloat16 scales and output; run only on the DML EP
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 5;
+  int64_t N = 5;
+  int64_t K = 2;
 
+  std::vector<uint8_t> A_data = {3, 7, 2, 1, 1,  // 10 values; registered below with dims {M, K} = {5, 2}, not the 2x5 layout shown here
+                                 2, 1, 9, 1, 1};
+  std::vector<int8_t> B_data = {2, -1, -9, 1, 1,  // registered below with dims {K, N} = {2, 5}
+                                 -1, 0, -3, 1, -4};
+  std::vector<MLFloat16> A_scale = ToFloat16({-4.0f});
+  std::vector<MLFloat16> B_scale = ToFloat16({2.0f});
+  test.AddInput<uint8_t>("A", {M, K}, A_data);
+  test.AddInput<int8_t>("B", {K, N}, B_data);
+  std::vector<uint8_t> A_zero_point = {1};
+  std::vector<int8_t> B_zero_point = {3};
+  std::vector<MLFloat16> Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f});  // NOTE(review): Bias is never added as an input nor passed to the reference below - confirm whether a bias case was intended
 
+  test.AddInput<MLFloat16>("a_scale", {1}, A_scale);
+  test.AddInput<MLFloat16>("b_scale", {1}, B_scale);
+  test.AddInput<uint8_t>("a_zero_point", {1}, A_zero_point);
+  test.AddInput<int8_t>("b_zero_point", {1}, B_zero_point);
 
+  std::vector<float> Y_data(M * N);  // reference output, computed in float and converted to MLFloat16 below
+  CalculateMatMulIntegerToFloat<uint8_t, int8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, {}, Y_data, false, true, false);  // trailing flags: per_column=false, has_zp=true, has_bias=false; {} is the (absent) bias
 
+  test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
 
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;  // explicit provider list so only the DML EP is exercised
+  execution_providers.push_back(DefaultDmlExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
 
+TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8S8) {  // int8 A x int8 B with MLFloat16 scales, bias and output; run only on the DML EP
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 5;
+  int64_t N = 5;
+  int64_t K = 2;
 
+  std::vector<int8_t> A_data = {3, 7, -2, 1, 1,  // 10 values; registered below with dims {M, K} = {5, 2}, not the 2x5 layout shown here
+                                 2, -1, -9, 1, 1};
+  std::vector<int8_t> B_data = {2, -1, -9, 1, 1,  // registered below with dims {K, N} = {2, 5}
+                                -1, 0, -3, 1, -4};
+  std::vector<MLFloat16> A_scale = ToFloat16({-4.0f});
+  std::vector<MLFloat16> B_scale = ToFloat16({2.0f});
+  test.AddInput<int8_t>("A", {M, K}, A_data);
+  test.AddInput<int8_t>("B", {K, N}, B_data);
+  std::vector<int8_t> A_zero_point = {-1};
+  std::vector<int8_t> B_zero_point = {3};
+  std::vector<MLFloat16> Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f});  // one bias value per output column (N = 5)
 
+  test.AddInput<MLFloat16>("a_scale", {1}, A_scale);
+  test.AddInput<MLFloat16>("b_scale", {1}, B_scale);
+  test.AddInput<int8_t>("a_zero_point", {1}, A_zero_point);
+  test.AddInput<int8_t>("b_zero_point", {1}, B_zero_point);
+  test.AddInput<MLFloat16>("bias", {N}, Bias);
 
 
-#if USE_DML
-
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8S8) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, false>("testdata/matmul_integer_to_float_int8.onnx");
-}
-
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8S8) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, false, true>("testdata/matmul_integer_to_float_int8_bias.onnx");
-}
-
-TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8S8) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, false, false>("testdata/matmul_integer_to_float_uint8.onnx");
-}
+  std::vector<float> Y_data(M * N);  // reference output, computed in float and converted to MLFloat16 below
+  CalculateMatMulIntegerToFloat<int8_t, int8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, Bias, Y_data, false, true, true);  // trailing flags: per_column=false, has_zp=true, has_bias=true
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, true>("testdata/matmul_integer_to_float_uint8_bias.onnx");
-}
+  test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, true, false>("testdata/matmul_integer_to_float_int8_int8.onnx");
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;  // explicit provider list so only the DML EP is exercised
+  execution_providers.push_back(DefaultDmlExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) {
-  RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, true, false>("testdata/matmul_integer_to_float_int8_int8.onnx");
-}
+TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8U8) {  // int8 A x uint8 B with MLFloat16 scales, bias and output; run only on the DML EP
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 5;
+  int64_t N = 5;
+  int64_t K = 2;
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, false, true>("testdata/matmul_integer_to_float_int8_int8_bias.onnx");
-}
+  std::vector<int8_t> A_data = {3, 7, -2, 1, 1,  // 10 values; registered below with dims {M, K} = {5, 2}, not the 2x5 layout shown here
+                                2, -1, -9, 1, 1};
+  std::vector<uint8_t> B_data = {3, 7, 2, 1, 1,  // registered below with dims {K, N} = {2, 5}
+                                 2, 1, 9, 1, 1};
+  std::vector<MLFloat16> A_scale = ToFloat16({-4.0f});
+  std::vector<MLFloat16> B_scale = ToFloat16({2.0f});
+  test.AddInput<int8_t>("A", {M, K}, A_data);
+  test.AddInput<uint8_t>("B", {K, N}, B_data);
+  std::vector<int8_t> A_zero_point = {-1};
+  std::vector<uint8_t> B_zero_point = {3};
+  std::vector<MLFloat16> Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f});  // one bias value per output column (N = 5)
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8U8) {
-  RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, false, true>("testdata/matmul_integer_to_float_int8_int8_bias.onnx");
-}
+  test.AddInput<MLFloat16>("a_scale", {1}, A_scale);
+  test.AddInput<MLFloat16>("b_scale", {1}, B_scale);
+  test.AddInput<int8_t>("a_zero_point", {1}, A_zero_point);
+  test.AddInput<uint8_t>("b_zero_point", {1}, B_zero_point);
+  test.AddInput<MLFloat16>("bias", {N}, Bias);
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8S8) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, false, false>("testdata/matmul_integer_to_float_int8_int8.onnx");
-}
+  std::vector<float> Y_data(M * N);  // reference output, computed in float and converted to MLFloat16 below
+  CalculateMatMulIntegerToFloat<int8_t, uint8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, Bias, Y_data, false, true, true);  // trailing flags: per_column=false, has_zp=true, has_bias=true
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8U8) {
-  RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, false, false>("testdata/matmul_integer_to_float_int8_int8.onnx");
-}
+  test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8S8) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, true, true>("testdata/matmul_integer_to_float_int8_int8_bias.onnx");
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;  // explicit provider list so only the DML EP is exercised
+  execution_providers.push_back(DefaultDmlExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8U8) {
-  RunMatMulIntegerToFloatTest<int8_t, int8_t, float, true, true>("testdata/matmul_integer_to_float_int8_int8_bias.onnx");
-}
+TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16) {  // int8 x int8 with per-column B scale/zero point, bias and MLFloat16 output; run only on the DML EP
+  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
+  int64_t M = 2;
+  int64_t N = 2;
+  int64_t K = 3;
 
-#endif
+  std::vector<int8_t> A_data = {11, -2, 5,  // registered below with dims {M, K} = {2, 3}
+                                 -1, 3, 10};
+  std::vector<int8_t> B_data = {-13, -2,  // registered below with dims {K, N} = {3, 2}
+                                 9, 55,
+                                 -1, 23};
+  std::vector<MLFloat16> A_scale = ToFloat16({0.910f});
+  std::vector<MLFloat16> B_scale = ToFloat16({1.10f, 1.123f});  // two entries, registered below with dims {N}
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8U8) {
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, true, false>("testdata/matmul_integer_to_float_uint8.onnx");
-}
+  std::vector<int8_t> A_zero_point = {113};
+  std::vector<int8_t> B_zero_point = {98, 71};  // two entries, registered below with dims {N}
 
+  std::vector<MLFloat16> Bias = ToFloat16({0.10f, 1.123f});
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8U8) {
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, false, true>("testdata/matmul_integer_to_float_uint8_bias.onnx");
-}
+  test.AddInput<int8_t>("A", {M, K}, A_data);
+  test.AddInput<int8_t>("B", {K, N}, B_data);
 
-TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8U8) {
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, false, false>("testdata/matmul_integer_to_float_uint8.onnx");
-}
+  test.AddInput<MLFloat16>("a_scale", {}, {A_scale});  // empty dims list - presumably a scalar tensor; TODO confirm against OpTester::AddInput
+  test.AddInput<MLFloat16>("b_scale", {N}, B_scale);
+  test.AddInput<int8_t>("a_zero_point", {}, {A_zero_point});  // empty dims list, same as a_scale above
+  test.AddInput<int8_t>("b_zero_point", {N}, B_zero_point);
+  test.AddInput<MLFloat16>("bias", {N}, Bias);
 
+  std::vector<float> Y_data(M * N);  // reference output, computed in float and converted to MLFloat16 below
+  CalculateMatMulIntegerToFloat<int8_t, int8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, Bias, Y_data, true, true, true);  // trailing flags: per_column=true, has_zp=true, has_bias=true
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8) {
-  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, true, true>("testdata/matmul_integer_to_float_uint8_bias.onnx");
+  test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
+  test.SetOutputRelErr("Y", 2e-2f);  // relative tolerance of 2e-2 on Y; this is the only FP16 test here that sets one
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;  // explicit provider list so only the DML EP is exercised
+  execution_providers.push_back(DefaultDmlExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
+#endif
 
 TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) {
   auto test_case = [&](const std::vector<int64_t>& input_shape,
@@ -478,242 +476,5 @@ TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) {
   test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
 }
 
-TEST(MatMulIntegerToFloat, CustomMatMul) {
-  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
-  int64_t M = 2;
-  int64_t N = 2;
-  int64_t K = 2;
-
-  std::vector<uint8_t> AMatrix = {1, 1,
-                              1, 1};
-  std::vector<uint8_t> BMatrix = {1, 1,
-                              1, 1};
-  test.AddInput<uint8_t>("A", {M,K}, AMatrix);
-  test.AddInput<uint8_t>("B", {N,K}, BMatrix);
-
-  test.AddInput<float>("a_scale", {}, {1.0f});
-  test.AddInput<float>("b_scale", {}, {1.0f});
-  //test.AddInput<uint8_t>("a_zero_point", {}, {113});
-
-  std::vector<float> expected_vals(M * N);
-  for (int64_t m = 0; m < M; m++) {
-    for (int64_t n = 0; n < N; n++) {
-      float sum = 0.0f;
-      for (int64_t k = 0; k < K; k++) {
-        sum += AMatrix[m * K + k] * BMatrix[k * N + n];
-      }
-      expected_vals[m * N + n] = sum;
-    }
-  }
-
-  test.AddOutput<float>("Y", {M , N}, expected_vals);
-
-  test.Run();
-}
-
-TEST(MatMulIntegerToFloat, CustomZPMatMul) {
-  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
-  int64_t M = 2;
-  int64_t N = 2;
-  int64_t K = 2;
-
-  std::vector<uint8_t> AMatrix = {1, 1,
-                                  1, 1};
-  std::vector<int8_t> BMatrix = {1, -1,
-                                  1, 1};
-  float AScale = 1.0f;
-  float BScale = 1.0f;
-
-  uint8_t AZP = 113;
-  int8_t BZP = -16;
-
-  test.AddInput<uint8_t>("A", {M, K}, AMatrix);
-  test.AddInput<int8_t>("B", {N, K}, BMatrix);
-
-  test.AddInput<float>("a_scale", {}, {AScale});
-  test.AddInput<float>("b_scale", {}, {BScale});
-  test.AddInput<uint8_t>("a_zero_point", {}, {AZP});
-  test.AddInput<int8_t>("b_zero_point", {}, {BZP});
-
-  std::vector<float> expected_vals(M * N);
-  for (int64_t m = 0; m < M; m++) {
-    for (int64_t n = 0; n < N; n++) {
-      float sum = 0.0f;
-      for (int64_t k = 0; k < K; k++) {
-        sum += ((AMatrix[m * K + k] - AZP) * AScale) * ((BMatrix[k * N + n] - BZP) * BScale);
-      }
-      expected_vals[m * N + n] = sum;
-    }
-  }
-
-  test.AddOutput<float>("Y", {M, N}, expected_vals);
-
-  test.Run();
-}
-
-TEST(MatMulIntegerToFloat, CustomScaleMatMul) {
-  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
-  int64_t M = 2;
-  int64_t N = 2;
-  int64_t K = 2;
-
-  std::vector<uint8_t> AMatrix = {1, 1,
-                                  1, 1};
-  std::vector<uint8_t> BMatrix = {1, 1,
-                                  1, 1};
-  float AScale = 0.910f;
-  float BScale = 1.10f;
-
-  uint8_t AZP = 1;
-  uint8_t BZP= 1;
-
-  test.AddInput<uint8_t>("A", {M, K}, AMatrix);
-  test.AddInput<uint8_t>("B", {N, K}, BMatrix);
-
-  test.AddInput<float>("a_scale", {}, {AScale});
-  test.AddInput<float>("b_scale", {}, {BScale});
-  test.AddInput<uint8_t>("a_zero_point", {}, {AZP});
-  test.AddInput<uint8_t>("b_zero_point", {}, {BZP});
-
-  std::vector<float> expected_vals(M * N);
-  for (int64_t m = 0; m < M; m++) {
-    for (int64_t n = 0; n < N; n++) {
-      float sum = 0.0f;
-      for (int64_t k = 0; k < K; k++) {
-        sum += ((AMatrix[m * K + k] - AZP) * AScale) * ((BMatrix[k * N + n] - BZP) * BScale);
-      }
-      expected_vals[m * N + n] = sum;
-    }
-  }
-
-  test.AddOutput<float>("Y", {M, N}, expected_vals);
-
-  test.Run();
-}
-
-TEST(MatMulIntegerToFloat, CustomMatMul1) {
-  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
-  int64_t M = 2;
-  int64_t N = 2;
-  int64_t K = 2;
-
-  std::vector<int8_t> AMatrix = {11, -2,
-                                  -1, 3};
-  std::vector<int8_t> BMatrix = {-13, -2,
-                                 -1, 23};
-  float AScale = 0.910f;
-  float BScale = 1.10f;
-
-  int8_t AZP = 113;
-  int8_t BZP = 98;
-
-  test.AddInput<int8_t>("A", {M, K}, AMatrix);
-  test.AddInput<int8_t>("B", {N, K}, BMatrix);
-
-  test.AddInput<float>("a_scale", {}, {AScale});
-  test.AddInput<float>("b_scale", {}, {BScale});
-  test.AddInput<int8_t>("a_zero_point", {}, {AZP});
-  test.AddInput<int8_t>("b_zero_point", {}, {BZP});
-
-  std::vector<float> expected_vals(M * N);
-  for (int64_t m = 0; m < M; m++) {
-    for (int64_t n = 0; n < N; n++) {
-      float sum = 0.0f;
-      for (int64_t k = 0; k < K; k++) {
-        sum += ((AMatrix[m * K + k] - AZP) * AScale) * ((BMatrix[k * N + n] - BZP) * BScale);
-      }
-      expected_vals[m * N + n] = sum;
-    }
-  }
-
-  test.AddOutput<float>("Y", {M, N}, expected_vals);
-
-  test.Run();
-}
-
-TEST(MatMulIntegerToFloat, CustomMatMul2) {
-  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
-  int64_t M = 2;
-  int64_t N = 2;
-  int64_t K = 2;
-
-  std::vector<int8_t> AMatrix = {11, -2,
-                                 -1, 3};
-  std::vector<int8_t> BMatrix = {-13, -2,
-                                 -1, 23};
-  float AScale = 0.910f;
-  std::vector<float> BScale = {1.10f, 1.123f};
-
-  int8_t AZP = 113;
-  std::vector<int8_t> BZP = {98, 71};
-
-  test.AddInput<int8_t>("A", {M, K}, AMatrix);
-  test.AddInput<int8_t>("B", {K, N}, BMatrix);
-
-  test.AddInput<float>("a_scale", {}, {AScale});
-  test.AddInput<float>("b_scale", {N}, BScale);
-  test.AddInput<int8_t>("a_zero_point", {}, {AZP});
-  test.AddInput<int8_t>("b_zero_point", {N}, BZP);
-
-  std::vector<float> expected_vals(M * N);
-  for (int64_t m = 0; m < M; m++) {
-    for (int64_t n = 0; n < N; n++) {
-      float sum = 0.0f;
-      for (int64_t k = 0; k < K; k++) {
-        sum += ((AMatrix[m * K + k] - AZP) * AScale) * ((BMatrix[k * N + n] - BZP[n]) * BScale[n]);
-      }
-      expected_vals[m * N + n] = sum;
-    }
-  }
-
-  test.AddOutput<float>("Y", {M, N}, expected_vals);
-
-  test.Run();
-}
-
-TEST(MatMulIntegerToFloat, CustomBiasMatMul) {
-  OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
-  int64_t M = 2;
-  int64_t N = 2;
-  int64_t K = 3;
-
-  std::vector<int8_t> AMatrix = {11, -2, 5,
-                                 -1, 3, 10};
-  std::vector<int8_t> BMatrix = {-13, -2,
-                                 9, 55,
-                                 -1, 23};
-  float AScale = 0.910f;
-  std::vector<float> BScale = {1.10f, 1.123f};
-
-  int8_t AZP = 113;
-  std::vector<int8_t> BZP = {98, 71};
-
-  std::vector<float> Bias = {0.10f, 1.123f};
-
-  test.AddInput<int8_t>("A", {M, K}, AMatrix);
-  test.AddInput<int8_t>("B", {K, N}, BMatrix);
-
-  test.AddInput<float>("a_scale", {}, {AScale});
-  test.AddInput<float>("b_scale", {N}, BScale);
-  test.AddInput<int8_t>("a_zero_point", {}, {AZP});
-  test.AddInput<int8_t>("b_zero_point", {N}, BZP);
-  test.AddInput<float>("bias", {N}, Bias);
-
-  std::vector<float> expected_vals(M * N);
-  for (int64_t m = 0; m < M; m++) {
-    for (int64_t n = 0; n < N; n++) {
-      float sum = 0.0f;
-      for (int64_t k = 0; k < K; k++) {
-        sum += ((AMatrix[m * K + k] - AZP) * AScale) * ((BMatrix[k * N + n] - BZP[n]) * BScale[n]);
-      }
-      expected_vals[m * N + n] = sum + Bias[n];
-    }
-  }
-
-  test.AddOutput<float>("Y", {M, N}, expected_vals);
-
-  test.Run();
-}
-
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py
index ac91877a0ea44..36902598aad14 100644
--- a/onnxruntime/test/testdata/matmul_integer_to_float.py
+++ b/onnxruntime/test/testdata/matmul_integer_to_float.py
@@ -78,35 +78,7 @@ def GenerateModel(model_name, sign_i, sign_w, output_type_fp16, has_zp=True, bia
 
 
 if __name__ == "__main__":
-    GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True)
-    GenerateModel("matmul_integer_to_float16_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=True)
-    GenerateModel(
-        "matmul_integer_to_float16_int8_bias.onnx",
-        sign_i=False,
-        sign_w=True,
-        output_type_fp16=True,
-        has_zp=False,
-        bias=True,
-    )
-    GenerateModel(
-        "matmul_integer_to_float16_uint8_bias.onnx",
-        sign_i=False,
-        sign_w=False,
-        output_type_fp16=True,
-        has_zp=False,
-        bias=True,
-    )
-
-    GenerateModel("matmul_integer_to_float16_int8_int8.onnx", sign_i=True, sign_w=True, output_type_fp16=True)
-    GenerateModel(
-        "matmul_integer_to_float16_int8_int8_bias.onnx",
-        sign_i=True,
-        sign_w=True,
-        output_type_fp16=True,
-        has_zp=False,
-        bias=True,
-    )
-
+    #GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True)
     GenerateModel("matmul_integer_to_float_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=False)
     GenerateModel("matmul_integer_to_float_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=False)
     GenerateModel(
diff --git a/onnxruntime/test/testdata/matmul_integer_to_float16_int8.onnx b/onnxruntime/test/testdata/matmul_integer_to_float16_int8.onnx
deleted file mode 100644
index 22293b0d10756..0000000000000
--- a/onnxruntime/test/testdata/matmul_integer_to_float16_int8.onnx
+++ /dev/null
@@ -1,51 +0,0 @@
-	:�
-U
-A
-B
-a_zero_point
-b_zero_pointmatmul_output_int32MatMulInteger"MatMulInteger
-.
-a_scale
-b_scale
-multiplier	mul_right"Mul
-A
-matmul_output_int32matmul_output_floatcast"Cast*	
-to
-�
-5
-matmul_output_float
-
-multiplierY
-mul_bottom"MulDynamicQuantizeMatMul_fusionZ
-A
-
-
-M
-KZ
-B
-
-
-K
-NZ
-a_scale
-
-
-
-Z
-b_scale
-	
-
-CZ
-a_zero_point
-
-
-Z
-b_zero_point
-	
-Cb
-Y
-
-
-
-M
-NB
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/matmul_integer_to_float16_int8_bias.onnx b/onnxruntime/test/testdata/matmul_integer_to_float16_int8_bias.onnx
deleted file mode 100644
index b92648e6ac23c..0000000000000
--- a/onnxruntime/test/testdata/matmul_integer_to_float16_int8_bias.onnx
+++ /dev/null
@@ -1,49 +0,0 @@
-	:�
-9
-A
-Bmatmul_output_int32MatMulInteger"MatMulInteger
-.
-a_scale
-b_scale
-multiplier	mul_right"Mul
-A
-matmul_output_int32matmul_output_floatcast"Cast*	
-to
-�
-E
-matmul_output_float
-
-multipliermul_bottom_output
-mul_bottom"Mul
-&
-mul_bottom_output
-biasYadd"AddDynamicQuantizeMatMul_fusionZ
-A
-
-
-M
-KZ
-B
-
-
-K
-NZ
-a_scale
-
-
-
-Z
-b_scale
-	
-
-CZ
-bias
-	
-
-Nb
-Y
-
-
-
-M
-NB
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/matmul_integer_to_float16_int8_int8.onnx b/onnxruntime/test/testdata/matmul_integer_to_float16_int8_int8.onnx
deleted file mode 100644
index 3bb5129ba0800..0000000000000
--- a/onnxruntime/test/testdata/matmul_integer_to_float16_int8_int8.onnx
+++ /dev/null
@@ -1,51 +0,0 @@
-	:�
-U
-A
-B
-a_zero_point
-b_zero_pointmatmul_output_int32MatMulInteger"MatMulInteger
-.
-a_scale
-b_scale
-multiplier	mul_right"Mul
-A
-matmul_output_int32matmul_output_floatcast"Cast*	
-to
-�
-5
-matmul_output_float
-
-multiplierY
-mul_bottom"MulDynamicQuantizeMatMul_fusionZ
-A
-
-
-M
-KZ
-B
-
-
-K
-NZ
-a_scale
-
-
-
-Z
-b_scale
-	
-
-CZ
-a_zero_point
-
-
-Z
-b_zero_point
-	
-Cb
-Y
-
-
-
-M
-NB
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/matmul_integer_to_float16_int8_int8_bias.onnx b/onnxruntime/test/testdata/matmul_integer_to_float16_int8_int8_bias.onnx
deleted file mode 100644
index 76bf3f698fcee..0000000000000
--- a/onnxruntime/test/testdata/matmul_integer_to_float16_int8_int8_bias.onnx
+++ /dev/null
@@ -1,49 +0,0 @@
-	:�
-9
-A
-Bmatmul_output_int32MatMulInteger"MatMulInteger
-.
-a_scale
-b_scale
-multiplier	mul_right"Mul
-A
-matmul_output_int32matmul_output_floatcast"Cast*	
-to
-�
-E
-matmul_output_float
-
-multipliermul_bottom_output
-mul_bottom"Mul
-&
-mul_bottom_output
-biasYadd"AddDynamicQuantizeMatMul_fusionZ
-A
-
-
-M
-KZ
-B
-
-
-K
-NZ
-a_scale
-
-
-
-Z
-b_scale
-	
-
-CZ
-bias
-	
-
-Nb
-Y
-
-
-
-M
-NB
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
deleted file mode 100644
index 22293b0d10756..0000000000000
--- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
+++ /dev/null
@@ -1,51 +0,0 @@
-	:�
-U
-A
-B
-a_zero_point
-b_zero_pointmatmul_output_int32MatMulInteger"MatMulInteger
-.
-a_scale
-b_scale
-multiplier	mul_right"Mul
-A
-matmul_output_int32matmul_output_floatcast"Cast*	
-to
-�
-5
-matmul_output_float
-
-multiplierY
-mul_bottom"MulDynamicQuantizeMatMul_fusionZ
-A
-
-
-M
-KZ
-B
-
-
-K
-NZ
-a_scale
-
-
-
-Z
-b_scale
-	
-
-CZ
-a_zero_point
-
-
-Z
-b_zero_point
-	
-Cb
-Y
-
-
-
-M
-NB
\ No newline at end of file

From b9e5f1544f805db21bedcaa4fa7c0ab465a26fc8 Mon Sep 17 00:00:00 2001
From: Anagha Rao <anagrao@microsoft.com>
Date: Tue, 30 Jan 2024 12:08:38 -0800
Subject: [PATCH 3/7] Lintrunner

---
 .../matmul_integer_to_float_test.cc           | 39 ++++++++-----------
 .../test/testdata/matmul_integer_to_float.py  |  2 +-
 2 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index 49560b8ff268a..b2e17c5333319 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -24,9 +24,7 @@ namespace onnxruntime {
 namespace test {
 
 template <typename IType, typename WType, typename OType>
-static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, const int64_t K, const std::vector<IType>& A_data, const std::vector<OType>& A_scale, const std::vector<IType>& A_zero_point, const std::vector<WType>& B_data, std::vector<OType>& B_scale, std::vector<WType>& B_zero_point, const 
-  std::vector<OType>& Bias, std::vector<float>& Y_data, bool per_column, bool has_zp, bool has_bias) {
- 
+static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, const int64_t K, const std::vector<IType>& A_data, const std::vector<OType>& A_scale, const std::vector<IType>& A_zero_point, const std::vector<WType>& B_data, std::vector<OType>& B_scale, std::vector<WType>& B_zero_point, const std::vector<OType>& Bias, std::vector<float>& Y_data, bool per_column, bool has_zp, bool has_bias) {
   if (!per_column) {
     B_zero_point.resize(N, B_zero_point[0]);
     B_scale.resize(N, B_scale[0]);
@@ -77,7 +75,7 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant,
                                                         std::numeric_limits<WType>::max());
 
   std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType {
-      return static_cast<WType>(v);
+    return static_cast<WType>(v);
   });
 
   std::vector<OType> A_scale = random.Uniform<OType>(AsSpan<int64_t>({1}), -0.1f, 0.1f);
@@ -120,22 +118,21 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant,
   std::vector<float> Y_data(M * N);
   CalculateMatMulIntegerToFloat<IType, WType, OType>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, Bias, Y_data, per_column, has_zp, has_bias);
 
-    if ( constexpr(std::is_same_v<OType, float>)) {
-      test.AddOutput<float>("Y", {M, N}, Y_data);
-    } else {
-      test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
-      test.SetOutputAbsErr("Y", 0.5f);
-    }
+  if (constexpr(std::is_same_v<OType, float>)) {
+    test.AddOutput<float>("Y", {M, N}, Y_data);
+  } else {
+    test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
+    test.SetOutputAbsErr("Y", 0.5f);
+  }
 
   // Only DML EP supports these data type combinations for now
   if ((constexpr(std::is_same_v<OType, MLFloat16>)) ||
       (constexpr(std::is_same_v<OType, float>) &&
-         /*(constexpr(std::is_same_v<IType, uint8_t>) &&*/  !constexpr(std::is_same_v<WType, IType>))
-          ) {
+       !constexpr(std::is_same_v<WType, IType>))) {
     std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
     execution_providers.push_back(DefaultDmlExecutionProvider());
     test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
-    } else {
+  } else {
     test.Run();
   }
 }
@@ -191,7 +188,6 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8U8) {
   RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, true, false>();
 }
 
-
 TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8U8) {
   RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, false, true>();
 }
@@ -200,7 +196,6 @@ TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8U8) {
   RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, false, false>();
 }
 
-
 TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8) {
   RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, true, true>();
 }
@@ -247,7 +242,7 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8U8) {
   int64_t K = 2;
 
   std::vector<uint8_t> A_data = {1, 5, 2, 1, 9,
-                              1, 1, 3, 7, 2};
+                                 1, 1, 3, 7, 2};
   std::vector<uint8_t> B_data = {3, 7, 2, 1, 1,
                                  2, 1, 9, 1, 1};
   std::vector<MLFloat16> A_scale = ToFloat16({3.0f});
@@ -257,7 +252,6 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8U8) {
   std::vector<uint8_t> A_zero_point = {3};
   std::vector<uint8_t> B_zero_point = {5};
 
-
   test.AddInput<MLFloat16>("a_scale", {1}, A_scale);
   test.AddInput<MLFloat16>("b_scale", {1}, B_scale);
   test.AddInput<uint8_t>("a_zero_point", {1}, A_zero_point);
@@ -281,7 +275,7 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8S8) {
   std::vector<uint8_t> A_data = {3, 7, 2, 1, 1,
                                  2, 1, 9, 1, 1};
   std::vector<int8_t> B_data = {2, -1, -9, 1, 1,
-                                 -1, 0, -3, 1, -4};
+                                -1, 0, -3, 1, -4};
   std::vector<MLFloat16> A_scale = ToFloat16({-4.0f});
   std::vector<MLFloat16> B_scale = ToFloat16({2.0f});
   test.AddInput<uint8_t>("A", {M, K}, A_data);
@@ -312,7 +306,7 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8S8) {
   int64_t K = 2;
 
   std::vector<int8_t> A_data = {3, 7, -2, 1, 1,
-                                 2, -1, -9, 1, 1};
+                                2, -1, -9, 1, 1};
   std::vector<int8_t> B_data = {2, -1, -9, 1, 1,
                                 -1, 0, -3, 1, -4};
   std::vector<MLFloat16> A_scale = ToFloat16({-4.0f});
@@ -329,7 +323,6 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8S8) {
   test.AddInput<int8_t>("b_zero_point", {1}, B_zero_point);
   test.AddInput<MLFloat16>("bias", {N}, Bias);
 
-
   std::vector<float> Y_data(M * N);
   CalculateMatMulIntegerToFloat<int8_t, int8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, Bias, Y_data, false, true, true);
 
@@ -381,10 +374,10 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16) {
   int64_t K = 3;
 
   std::vector<int8_t> A_data = {11, -2, 5,
-                                 -1, 3, 10};
+                                -1, 3, 10};
   std::vector<int8_t> B_data = {-13, -2,
-                                 9, 55,
-                                 -1, 23};
+                                9, 55,
+                                -1, 23};
   std::vector<MLFloat16> A_scale = ToFloat16({0.910f});
   std::vector<MLFloat16> B_scale = ToFloat16({1.10f, 1.123f});
 
diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py
index 36902598aad14..0c9ee3f3e6492 100644
--- a/onnxruntime/test/testdata/matmul_integer_to_float.py
+++ b/onnxruntime/test/testdata/matmul_integer_to_float.py
@@ -78,7 +78,7 @@ def GenerateModel(model_name, sign_i, sign_w, output_type_fp16, has_zp=True, bia
 
 
 if __name__ == "__main__":
-    #GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True)
+    # GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True)
     GenerateModel("matmul_integer_to_float_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=False)
     GenerateModel("matmul_integer_to_float_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=False)
     GenerateModel(

From 453fa9ef3df9e4658a97d4c3882ca391e9633525 Mon Sep 17 00:00:00 2001
From: Anagha Rao <anagrao@microsoft.com>
Date: Tue, 30 Jan 2024 16:34:24 -0800
Subject: [PATCH 4/7] add matmul_integer_to_float16_int8.onnx for graph
 transformer test and line characters update

---
 .../matmul_integer_to_float_test.cc           | 37 ++++++++++----
 .../test/testdata/matmul_integer_to_float.py  |  2 +-
 .../matmul_integer_to_float16_int8.onnx       | 51 +++++++++++++++++++
 3 files changed, 80 insertions(+), 10 deletions(-)
 create mode 100644 onnxruntime/test/testdata/matmul_integer_to_float16_int8.onnx

diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index b2e17c5333319..dc466c933c6d7 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -24,7 +24,12 @@ namespace onnxruntime {
 namespace test {
 
 template <typename IType, typename WType, typename OType>
-static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, const int64_t K, const std::vector<IType>& A_data, const std::vector<OType>& A_scale, const std::vector<IType>& A_zero_point, const std::vector<WType>& B_data, std::vector<OType>& B_scale, std::vector<WType>& B_zero_point, const std::vector<OType>& Bias, std::vector<float>& Y_data, bool per_column, bool has_zp, bool has_bias) {
+static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, const int64_t K,
+                                          const std::vector<IType>& A_data, const std::vector<OType>& A_scale,
+                                          const std::vector<IType>& A_zero_point, const std::vector<WType>& B_data,
+                                          std::vector<OType>& B_scale, std::vector<WType>& B_zero_point,
+                                          const std::vector<OType>& Bias, std::vector<float>& Y_data,
+                                          bool per_column, bool has_zp, bool has_bias) {
   if (!per_column) {
     B_zero_point.resize(N, B_zero_point[0]);
     B_scale.resize(N, B_scale[0]);
@@ -34,8 +39,10 @@ static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, cons
     for (int64_t n = 0; n < N; n++) {
       float sum = 0.0f;
       for (int64_t k = 0; k < K; k++) {
-        float A_dequantized = has_zp ? (A_data[m * K + k] - A_zero_point[0]) * A_scale[0] : A_data[m * K + k] * A_scale[0];
-        float B_dequantized = has_zp ? (B_data[k * N + n] - B_zero_point[n]) * B_scale[n] : B_data[k * N + n] * B_scale[n];
+        float A_dequantized = has_zp ?
+            (A_data[m * K + k] - A_zero_point[0]) * A_scale[0] : A_data[m * K + k] * A_scale[0];
+        float B_dequantized = has_zp ?
+            (B_data[k * N + n] - B_zero_point[n]) * B_scale[n] : B_data[k * N + n] * B_scale[n];
 
         sum += A_dequantized * B_dequantized;
       }
@@ -116,7 +123,9 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant,
   }
 
   std::vector<float> Y_data(M * N);
-  CalculateMatMulIntegerToFloat<IType, WType, OType>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, Bias, Y_data, per_column, has_zp, has_bias);
+  CalculateMatMulIntegerToFloat<IType, WType, OType>(M, N, K, A_data, A_scale, A_zero_point,
+                                                     B_data, B_scale, B_zero_point, Bias, Y_data,
+                                                     per_column, has_zp, has_bias);
 
   if (constexpr(std::is_same_v<OType, float>)) {
     test.AddOutput<float>("Y", {M, N}, Y_data);
@@ -258,7 +267,9 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8U8) {
   test.AddInput<uint8_t>("b_zero_point", {1}, B_zero_point);
 
   std::vector<float> Y_data(M * N);
-  CalculateMatMulIntegerToFloat<uint8_t, uint8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, {}, Y_data, false, true, false);
+  CalculateMatMulIntegerToFloat<uint8_t, uint8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point,
+                                                             B_data, B_scale, B_zero_point, {}, Y_data,
+                                                             false, true, false);
 
   test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
@@ -290,7 +301,9 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8S8) {
   test.AddInput<int8_t>("b_zero_point", {1}, B_zero_point);
 
   std::vector<float> Y_data(M * N);
-  CalculateMatMulIntegerToFloat<uint8_t, int8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, {}, Y_data, false, true, false);
+  CalculateMatMulIntegerToFloat<uint8_t, int8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point,
+                                                            B_data, B_scale, B_zero_point, {}, Y_data,
+                                                            false, true, false);
 
   test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
 
@@ -324,7 +337,9 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8S8) {
   test.AddInput<MLFloat16>("bias", {N}, Bias);
 
   std::vector<float> Y_data(M * N);
-  CalculateMatMulIntegerToFloat<int8_t, int8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, Bias, Y_data, false, true, true);
+  CalculateMatMulIntegerToFloat<int8_t, int8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point,
+                                                           B_data, B_scale, B_zero_point, Bias, Y_data,
+                                                           false, true, true);
 
   test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
 
@@ -358,7 +373,9 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8U8) {
   test.AddInput<MLFloat16>("bias", {N}, Bias);
 
   std::vector<float> Y_data(M * N);
-  CalculateMatMulIntegerToFloat<int8_t, uint8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, Bias, Y_data, false, true, true);
+  CalculateMatMulIntegerToFloat<int8_t, uint8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point,
+                                                            B_data, B_scale, B_zero_point, Bias, Y_data,
+                                                            false, true, true);
 
   test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
 
@@ -396,7 +413,9 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16) {
   test.AddInput<MLFloat16>("bias", {N}, Bias);
 
   std::vector<float> Y_data(M * N);
-  CalculateMatMulIntegerToFloat<int8_t, int8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point, B_data, B_scale, B_zero_point, Bias, Y_data, true, true, true);
+  CalculateMatMulIntegerToFloat<int8_t, int8_t, MLFloat16>(M, N, K, A_data, A_scale, A_zero_point,
+                                                           B_data, B_scale, B_zero_point, Bias, Y_data,
+                                                           true, true, true);
 
   test.AddOutput<MLFloat16>("Y", {M, N}, ToFloat16(Y_data));
   test.SetOutputRelErr("Y", 2e-2f);
diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py
index 0c9ee3f3e6492..e6c51009018f9 100644
--- a/onnxruntime/test/testdata/matmul_integer_to_float.py
+++ b/onnxruntime/test/testdata/matmul_integer_to_float.py
@@ -78,7 +78,7 @@ def GenerateModel(model_name, sign_i, sign_w, output_type_fp16, has_zp=True, bia
 
 
 if __name__ == "__main__":
-    # GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True)
+    GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True)
     GenerateModel("matmul_integer_to_float_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=False)
     GenerateModel("matmul_integer_to_float_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=False)
     GenerateModel(
diff --git a/onnxruntime/test/testdata/matmul_integer_to_float16_int8.onnx b/onnxruntime/test/testdata/matmul_integer_to_float16_int8.onnx
new file mode 100644
index 0000000000000..22293b0d10756
--- /dev/null
+++ b/onnxruntime/test/testdata/matmul_integer_to_float16_int8.onnx
@@ -0,0 +1,51 @@
+	:�
+U
+A
+B
+a_zero_point
+b_zero_pointmatmul_output_int32MatMulInteger"MatMulInteger
+.
+a_scale
+b_scale
+multiplier	mul_right"Mul
+A
+matmul_output_int32matmul_output_floatcast"Cast*	
+to
+�
+5
+matmul_output_float
+
+multiplierY
+mul_bottom"MulDynamicQuantizeMatMul_fusionZ
+A
+
+
+M
+KZ
+B
+
+
+K
+NZ
+a_scale
+
+
+
+Z
+b_scale
+	
+
+CZ
+a_zero_point
+
+
+Z
+b_zero_point
+	
+Cb
+Y
+
+
+
+M
+NB
\ No newline at end of file

From af2d24d4f36b70f0f84cfd2aeb449ef83e32a41b Mon Sep 17 00:00:00 2001
From: Anagha Rao <anagrao@microsoft.com>
Date: Thu, 8 Feb 2024 09:43:21 -0800
Subject: [PATCH 5/7] Avoid Overflow conditions

---
 .../test/contrib_ops/matmul_integer_to_float_test.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index dc466c933c6d7..adb24dc75f375 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -40,9 +40,11 @@ static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, cons
       float sum = 0.0f;
       for (int64_t k = 0; k < K; k++) {
         float A_dequantized = has_zp ?
-            (A_data[m * K + k] - A_zero_point[0]) * A_scale[0] : A_data[m * K + k] * A_scale[0];
+                              (static_cast<int>(A_data[m * K + k]) - static_cast<int>(A_zero_point[0])) * A_scale[0] :
+                              A_data[m * K + k] * A_scale[0];
         float B_dequantized = has_zp ?
-            (B_data[k * N + n] - B_zero_point[n]) * B_scale[n] : B_data[k * N + n] * B_scale[n];
+                              (static_cast<int>(B_data[k * N + n]) - static_cast<int>(B_zero_point[n])) * B_scale[n] :
+                              B_data[k * N + n] * B_scale[n];
 
         sum += A_dequantized * B_dequantized;
       }
@@ -258,8 +260,8 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8U8) {
   std::vector<MLFloat16> B_scale = ToFloat16({2.0f});
   test.AddInput<uint8_t>("A", {M, K}, A_data);
   test.AddInput<uint8_t>("B", {K, N}, B_data);
-  std::vector<uint8_t> A_zero_point = {3};
-  std::vector<uint8_t> B_zero_point = {5};
+  std::vector<uint8_t> A_zero_point = {1};
+  std::vector<uint8_t> B_zero_point = {1};
 
   test.AddInput<MLFloat16>("a_scale", {1}, A_scale);
   test.AddInput<MLFloat16>("b_scale", {1}, B_scale);
@@ -363,7 +365,7 @@ TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8U8) {
   test.AddInput<int8_t>("A", {M, K}, A_data);
   test.AddInput<uint8_t>("B", {K, N}, B_data);
   std::vector<int8_t> A_zero_point = {-1};
-  std::vector<uint8_t> B_zero_point = {3};
+  std::vector<uint8_t> B_zero_point = {1};
   std::vector<MLFloat16> Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f});
 
   test.AddInput<MLFloat16>("a_scale", {1}, A_scale);

From d3acbaca7d4673bcdbebc799c4d23d8572313dc1 Mon Sep 17 00:00:00 2001
From: Anagha Rao <anagrao@microsoft.com>
Date: Thu, 8 Feb 2024 12:17:20 -0800
Subject: [PATCH 6/7] Avoid saturation for U8S8 CPU testcases

---
 .../matmul_integer_to_float_test.cc           | 46 ++++++++++---------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index adb24dc75f375..dc92068134c67 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -79,10 +79,11 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant,
 
   std::vector<WType> B_data;
 
-  std::vector<WType> tmp_B_data = random.Uniform<WType>(B_dims,
-                                                        std::numeric_limits<WType>::lowest(),
-                                                        std::numeric_limits<WType>::max());
-
+  std::vector<WType> tmp_B_data;
+  tmp_B_data = random.Uniform<WType>(B_dims,
+                                     (constexpr(std::is_same_v<WType, int8_t>)) ?
+                                     std::numeric_limits<int8_t>::lowest()/2 : std::numeric_limits<uint8_t>::lowest(),
+                                     std::numeric_limits<WType>::max() / 2);
   std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType {
     return static_cast<WType>(v);
   });
@@ -139,7 +140,8 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant,
   // Only DML EP supports these data type combinations for now
   if ((constexpr(std::is_same_v<OType, MLFloat16>)) ||
       (constexpr(std::is_same_v<OType, float>) &&
-       !constexpr(std::is_same_v<WType, IType>))) {
+       constexpr(std::is_same_v<IType, int8_t>) &&
+       constexpr(std::is_same_v<WType, uint8_t>))) {
     std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
     execution_providers.push_back(DefaultDmlExecutionProvider());
     test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
@@ -211,7 +213,23 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8) {
   RunMatMulIntegerToFloatTest<uint8_t, uint8_t, float, true, true>();
 }
 
-// DML EP supports Float16 output type and A Matrix and B Matric of different data types for Float32 output
+TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8S8) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, false>();
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8S8) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, false, true>();
+}
+
+TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8S8) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, false, false>();
+}
+
+TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) {
+  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, true>();
+}
+
+// DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matrix for Float32 output
 #if defined(USE_DML)
 
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) {
@@ -230,22 +248,6 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8U8) {
   RunMatMulIntegerToFloatTest<int8_t, int8_t, float, true, true>();
 }
 
-TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8S8) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, false>();
-}
-
-TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8S8) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, false, true>();
-}
-
-TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8S8) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, false, false>();
-}
-
-TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) {
-  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, true>();
-}
-
 TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8U8) {
   OpTester test("MatMulIntegerToFloat", 1, kMSDomain);
   int64_t M = 5;

From 8925c191d90e3e570f5c6d9f6d9cff1a1c9f5b92 Mon Sep 17 00:00:00 2001
From: Anagha Rao <anagrao@microsoft.com>
Date: Thu, 8 Feb 2024 17:36:43 -0800
Subject: [PATCH 7/7] move matmul_integer_to_float16_int8.onnx

---
 .../{ => transform/fusion}/matmul_integer_to_float16_int8.onnx    | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename onnxruntime/test/testdata/{ => transform/fusion}/matmul_integer_to_float16_int8.onnx (100%)

diff --git a/onnxruntime/test/testdata/matmul_integer_to_float16_int8.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
similarity index 100%
rename from onnxruntime/test/testdata/matmul_integer_to_float16_int8.onnx
rename to onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx