diff --git a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc index c70f659f1b645..c24ce078da978 100644 --- a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc +++ b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc @@ -23,20 +23,90 @@ namespace onnxruntime { namespace test { template -void TestDynamicQuantizeMatMul(const std::vector& A_dims, - std::vector B_dims, - const std::string& reference_model, - bool is_matrix_b_constant, +static void CalculateDynamicQuantizeMatMul(const int64_t M, const int64_t N, const int64_t K, + const std::vector& A_data, const std::vector& B_data, + std::vector& B_scale, std::vector& B_zero_point, + const std::vector& Bias, std::vector& Y_data, + bool per_column, bool has_zp, bool has_bias) { + // DynamicQuantize Matrix A + const uint32_t num_elements = M * K; + std::vector QuantA_data(num_elements); + std::vector A_scale; + std::vector A_zero_point; + + // Get max and min + float min = std::numeric_limits::max(); + float max = std::numeric_limits::lowest(); + float qmax = static_cast(std::numeric_limits::max()); + float qmin = static_cast(std::numeric_limits::lowest()); + + for (uint32_t i = 0; i < num_elements; ++i) { + max = std::max(A_data[i], max); + min = std::min(A_data[i], min); + } + + // Adjust the maximum and minimum to include zero + max = std::max(max, 0.0f); + min = std::min(min, 0.0f); + + float scale = static_cast(max - min) / (qmax - qmin); + T zeroPoint = std::round(std::clamp(qmin - min / scale, qmin, qmax)); + + A_scale.push_back(scale); + A_zero_point.push_back(zeroPoint); + + // Matrix Multiplication + for (uint32_t i = 0; i < num_elements; ++i) { + QuantA_data[i] = static_cast(std::round((A_data[i] / scale) + zeroPoint)); + } + if (!per_column) { + B_zero_point.resize(N, B_zero_point[0]); + B_scale.resize(N, B_scale[0]); + } + + for (int64_t m = 0; m < M; m++) { + for (int64_t n = 0; n < N; n++) { + float sum = 0.0f; + for (int64_t k = 0; k < K; k++) { + float A_dequantized = (static_cast(QuantA_data[m * K + k]) - static_cast(A_zero_point[0])) + * A_scale[0]; + + float B_dequantized = has_zp ? + (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : + B_data[k * N + n] * B_scale[n]; + + sum += A_dequantized * B_dequantized; + } + if (has_bias) { + sum += Bias[n]; + } + Y_data[m * N + n] = sum; + } + } +} + +template +void TestDynamicQuantizeMatMul(bool is_matrix_b_constant, bool per_column = false, bool has_zp = true, - bool has_bias = false) { + bool has_bias = false, + bool empty_input = false) { // create rand inputs RandomValueGenerator random{}; + int64_t M = empty_input ? 1 : 4; + int64_t N = 128; + int64_t K = 128; + std::vector A_dims{empty_input ? 0 : M, K}; + std::vector B_dims{K, N}; + std::vector Y_dims{empty_input ? 0 : M, K}; std::vector A_data = random.Uniform(A_dims, -1.0f, 1.0f); - std::vector B_data; - std::vector tmp_B_data = random.Uniform(B_dims, std::numeric_limits::min(), std::numeric_limits::max()); + std::vector tmp_B_data = random.Uniform(B_dims, + (constexpr(std::is_same_v)) ? + std::numeric_limits::lowest() / 2 : + std::numeric_limits::lowest(), + std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> T { return static_cast(v); }); @@ -47,7 +117,9 @@ void TestDynamicQuantizeMatMul(const std::vector& A_dims, std::for_each(B_zero_point.begin(), B_zero_point.end(), [&random](T& zp) { - zp = static_cast(random.Uniform(std::array{1}, std::numeric_limits::min(), std::numeric_limits::max())[0]); + zp = static_cast(random.Uniform(std::array{1}, + std::numeric_limits::min(), + std::numeric_limits::max())[0]); }); std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f); @@ -69,77 +141,84 @@ void TestDynamicQuantizeMatMul(const std::vector& A_dims, test.AddOptionalInputEdge(); } - test.AddReferenceOutputs(reference_model); + std::vector Y_data(M * N); + CalculateDynamicQuantizeMatMul(M, N, K, A_data, B_data, B_scale, B_zero_point, Bias, Y_data, + per_column, has_zp, has_bias); + test.AddOutput("Y", Y_dims, Y_data); test.Run(); } -template -void RunDynamicQuantizeMatMulTest(const string& model_path) { - std::vector A_dims{4, 128}; - std::vector B_dims{128, 128}; - std::vector Y_dims{4, 128}; - - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ +template +void RunDynamicQuantizeMatMulTest() { + TestDynamicQuantizeMatMul(false, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestDynamicQuantizeMatMul(true, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestDynamicQuantizeMatMul(false, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestDynamicQuantizeMatMul(true, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); } -TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test) { - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_int8.onnx"); - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_uint8.onnx"); +TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_S8) { + RunDynamicQuantizeMatMulTest(); } -TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test) { - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_int8_bias.onnx"); - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_uint8_bias.onnx"); +TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_U8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_S8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_U8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_S8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_U8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_S8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_U8) { + RunDynamicQuantizeMatMulTest(); } TEST(DynamicQuantizeMatMul, UInt8_test_with_empty_input) { - std::vector A_dims{0, 128}; - std::vector B_dims{128, 128}; - std::vector Y_dims{0, 128}; - - TestDynamicQuantizeMatMul(A_dims, - B_dims, - "testdata/dynamic_quantize_matmul_uint8.onnx", - false /*is_matrix_b_constant*/); - - TestDynamicQuantizeMatMul(A_dims, - B_dims, - "testdata/dynamic_quantize_matmul_uint8.onnx", - true /*is_matrix_b_constant*/); + std::vector A_dims{0, 2}; + std::vector B_dims{2, 2}; + std::vector Y_dims{0, 2}; + OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain); + test.AddInput("T1", A_dims, {}); + test.AddInput("T2", B_dims, {1, 6, 0, 8}); + test.AddInput("b_scale", {1}, {1.0f}); + test.AddInput("b_zero_point", {1}, {0}); + test.AddOptionalInputEdge(); + test.AddOutput("Y", {0, 2}, {}); + test.Run(); } TEST(DynamicQuantizeMatMul, B_PerColumn_ND) {