diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index 83dfb245f263e..71e808160163d 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -826,14 +826,6 @@ class QLinearMatMulHelper : public MatMulHelperBase QLinearMatMulHelper(const Info_t& info, const Shape_t& shape) : MatMulHelperBase(info, shape, 0, 3) {} }; -class MatMulIntegerToFloatHelper : public MatMulHelperBase -{ -public: - template - MatMulIntegerToFloatHelper(const Info_t& info, const Shape_t& shape) : MatMulHelperBase(info, shape, 0, 1) {} -}; - - class TopKHelper { void Initialize( diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 51d9a57b5e447..0d5dab35826c1 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -24,28 +24,66 @@ namespace onnxruntime { namespace test { template -void TestMatMulIntegerToFloat(const std::vector& A_dims, - std::vector B_dims, - const std::string& reference_model, - bool is_matrix_b_constant, +static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, const int64_t K, + const std::vector& A_data, const std::vector& A_scale, + const std::vector& A_zero_point, const std::vector& B_data, + std::vector& B_scale, std::vector& B_zero_point, + const std::vector& Bias, std::vector& Y_data, + bool per_column, bool has_zp, bool has_bias) { + if (!per_column) { + B_zero_point.resize(N, B_zero_point[0]); + B_scale.resize(N, B_scale[0]); + } + + for (int64_t m = 0; m < M; m++) { + for (int64_t n = 0; n < N; n++) { + float sum = 0.0f; + for (int64_t k = 0; k < K; k++) { + float A_dequantized = has_zp ? + (static_cast(A_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0] : + A_data[m * K + k] * A_scale[0]; + float B_dequantized = has_zp ? + (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : + B_data[k * N + n] * B_scale[n]; + + sum += A_dequantized * B_dequantized; + } + if (has_bias) { + sum += Bias[n]; + } + Y_data[m * N + n] = static_cast(sum); + } + } +} + +template +void TestMatMulIntegerToFloat(bool is_matrix_b_constant, bool per_column = false, bool has_zp = true, bool has_bias = false) { // create rand inputs RandomValueGenerator random{}; - + int64_t M = 4; + int64_t N = 128; + int64_t K = 128; + std::vector A_dims{M, K}; + std::vector B_dims{K, N}; + std::vector Y_dims{M, K}; std::vector A_data; - std::vector tmp_A_data = random.Uniform(A_dims, - std::numeric_limits::lowest(), - std::numeric_limits::max()); - std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> WType { + std::vector tmp_A_data = random.Uniform(A_dims, + std::numeric_limits::lowest(), + std::numeric_limits::max()); + std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> IType { return static_cast(v); }); std::vector B_data; - std::vector tmp_B_data = random.Uniform(B_dims, - std::numeric_limits::lowest(), - std::numeric_limits::max()); + + std::vector tmp_B_data; + tmp_B_data = random.Uniform(B_dims, + (constexpr(std::is_same_v)) ? + std::numeric_limits::lowest()/2 : std::numeric_limits::lowest(), + std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType { return static_cast(v); }); @@ -60,9 +98,9 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, std::for_each(B_zero_point.begin(), B_zero_point.end(), [&random](WType& zp) { - zp = static_cast(random.Uniform(std::array{1}, - std::numeric_limits::lowest(), - std::numeric_limits::max())[0]); + zp = static_cast(random.Uniform(std::array{1}, + std::numeric_limits::lowest(), + std::numeric_limits::max())[0]); }); std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f); @@ -77,7 +115,7 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, test.AddInput("a_zero_point", {1}, A_zero_point); test.AddInput("b_zero_point", {b_scale_zp_size}, B_zero_point); } else { - test.AddOptionalInputEdge(); + test.AddOptionalInputEdge(); test.AddOptionalInputEdge(); } @@ -87,39 +125,39 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, test.AddOptionalInputEdge(); } - test.AddReferenceOutputs(reference_model); -#if defined(USE_DML) - if constexpr (std::is_same_v) { - test.SetOutputRelErr("Y", 2e-2f); + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + per_column, has_zp, has_bias); + + if (constexpr(std::is_same_v)) { + test.AddOutput("Y", {M, N}, Y_data); } else { - test.SetOutputRelErr("Y", 2.0f); + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + test.SetOutputAbsErr("Y", 0.5f); } -#else - test.SetOutputRelErr("Y", 1e-4f); -#endif - if constexpr (std::is_same_v) { - test.Run(); + // Only DML EP supports these data type combinations for now + if ((constexpr(std::is_same_v)) || + (constexpr(std::is_same_v) && + constexpr(std::is_same_v) && + constexpr(std::is_same_v))) { + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } else { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider}); + test.Run(); } } template -void RunMatMulIntegerToFloatTest(const string& model_path) { - std::vector A_dims{4, 128}; - std::vector B_dims{128, 128}; - std::vector Y_dims{4, 128}; - +void RunMatMulIntegerToFloatTest() { TestMatMulIntegerToFloat( - A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + false, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); TestMatMulIntegerToFloat( @@ -130,66 +168,274 @@ void RunMatMulIntegerToFloatTest(const string& model_path) { false, /*per_column*/ HasZeroPoint, /*has_zp*/ HasBias /*has_bias*/ + true, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); TestMatMulIntegerToFloat( - A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + false, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); TestMatMulIntegerToFloat( - A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + true, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); } -#if USE_DML -TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_uint8.onnx"); +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8_FP16) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_bias.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_uint8_bias.onnx"); +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_int8.onnx"); +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_int8_bias.onnx"); +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -#endif // USE_DML -TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_uint8.onnx"); +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8U8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_bias.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_uint8_bias.onnx"); +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8U8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_int8.onnx"); +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8U8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_int8_bias.onnx"); +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +// DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output +#if defined(USE_DML) + +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8U8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {1, 5, 2, 1, 9, + 1, 1, 3, 7, 2}; + std::vector B_data = {3, 7, 2, 1, 1, + 2, 1, 9, 1, 1}; + std::vector A_scale = ToFloat16({3.0f}); + std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {1}; + std::vector B_zero_point = {1}; + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, {}, Y_data, + false, true, false); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8S8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {3, 7, 2, 1, 1, + 2, 1, 9, 1, 1}; + std::vector B_data = {2, -1, -9, 1, 1, + -1, 0, -3, 1, -4}; + std::vector A_scale = ToFloat16({-4.0f}); + std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {1}; + std::vector B_zero_point = {3}; + std::vector Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f}); + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, {}, Y_data, + false, true, false); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8S8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {3, 7, -2, 1, 1, + 2, -1, -9, 1, 1}; + std::vector B_data = {2, -1, -9, 1, 1, + -1, 0, -3, 1, -4}; + std::vector A_scale = ToFloat16({-4.0f}); + std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {-1}; + std::vector B_zero_point = {3}; + std::vector Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f}); + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + test.AddInput("bias", {N}, Bias); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + false, true, true); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8U8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {3, 7, -2, 1, 1, + 2, -1, -9, 1, 1}; + std::vector B_data = {3, 7, 2, 1, 1, + 2, 1, 9, 1, 1}; + std::vector A_scale = ToFloat16({-4.0f}); + std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {-1}; + std::vector B_zero_point = {1}; + std::vector Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f}); + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + test.AddInput("bias", {N}, Bias); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + false, true, true); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 2; + int64_t N = 2; + int64_t K = 3; + + std::vector A_data = {11, -2, 5, + -1, 3, 10}; + std::vector B_data = {-13, -2, + 9, 55, + -1, 23}; + std::vector A_scale = ToFloat16({0.910f}); + std::vector B_scale = ToFloat16({1.10f, 1.123f}); + + std::vector A_zero_point = {113}; + std::vector B_zero_point = {98, 71}; + + std::vector Bias = ToFloat16({0.10f, 1.123f}); + + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + + test.AddInput("a_scale", {}, {A_scale}); + test.AddInput("b_scale", {N}, B_scale); + test.AddInput("a_zero_point", {}, {A_zero_point}); + test.AddInput("b_zero_point", {N}, B_zero_point); + test.AddInput("bias", {N}, Bias); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + true, true, true); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + test.SetOutputRelErr("Y", 2e-2f); + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } +#endif TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) { auto test_case = [&](const std::vector& input_shape, diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py index 206a8514253c5..37db93a288b08 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/matmul_integer_to_float.py @@ -65,13 +65,6 @@ def GenerateModel(model_name, sign_i, sign_w, output_type_fp16, has_zp=True, bia if __name__ == "__main__": GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True) - GenerateModel("matmul_integer_to_float16_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=True) - GenerateModel("matmul_integer_to_float16_int8_bias.onnx", sign_i=False, sign_w=True, output_type_fp16=True, has_zp=False, bias=True) - GenerateModel("matmul_integer_to_float16_uint8_bias.onnx", sign_i=False, sign_w=False, output_type_fp16=True, has_zp=False, bias=True) - - GenerateModel("matmul_integer_to_float16_int8_int8.onnx", sign_i=True, sign_w=True, output_type_fp16=True) - GenerateModel("matmul_integer_to_float16_int8_int8_bias.onnx", sign_i=True, sign_w=True, output_type_fp16=True, has_zp=False, bias=True) - GenerateModel("matmul_integer_to_float_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=False) GenerateModel("matmul_integer_to_float_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=False) GenerateModel("matmul_integer_to_float_int8_bias.onnx", sign_i=False, sign_w=True, output_type_fp16=False, has_zp=False, bias=True)