From c8dc2f7fe5564b0dd1f08caed7950acb51b05b65 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Tue, 13 Feb 2024 18:30:57 -0800 Subject: [PATCH 1/2] DynamicQuantizeMatMul Test updates --- .../dynamic_quantize_matmul_test.cc | 225 +++++++++++++----- 1 file changed, 162 insertions(+), 63 deletions(-) diff --git a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc index c70f659f1b645..d4aa731d32631 100644 --- a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc +++ b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc @@ -23,20 +23,97 @@ namespace onnxruntime { namespace test { template -void TestDynamicQuantizeMatMul(const std::vector& A_dims, - std::vector B_dims, - const std::string& reference_model, - bool is_matrix_b_constant, +static void CalculateDynamicQuantizeMatMul(const int64_t M, const int64_t N, const int64_t K, + const std::vector& A_data, const std::vector& B_data, + std::vector& B_scale, std::vector& B_zero_point, + const std::vector& Bias, std::vector& Y_data, + bool per_column, bool has_zp, bool has_bias) { + // DynamicQuantize Matrix A + const uint32_t num_elements = M * K; + std::vector QuantA_data(num_elements); + std::vector A_scale; + std::vector A_zero_point; + + // + // Get max and min + // + + float min = std::numeric_limits::max(); + float max = std::numeric_limits::lowest(); + float qmax = static_cast(std::numeric_limits::max()); + float qmin = static_cast(std::numeric_limits::lowest()); + + + for (uint32_t i = 0; i < num_elements; ++i) { + max = std::max(A_data[i], max); + min = std::min(A_data[i], min); + } + + // + // Adjust the maximum and minimum to include zero + // + max = std::max(max, 0.0f); + min = std::min(min, 0.0f); + + // + // Compute quantization values and store in tensors + // + float scale = static_cast(max - min) / (qmax - qmin); + T zeroPoint = std::round(std::clamp(qmin - min / scale, qmin, qmax)); + + A_scale.push_back(scale); + A_zero_point.push_back(zeroPoint); + for (uint32_t i = 0; i < num_elements; ++i) { + QuantA_data[i] = static_cast(std::round((A_data[i] / scale ) + zeroPoint)); + } + if (!per_column) { + B_zero_point.resize(N, B_zero_point[0]); + B_scale.resize(N, B_scale[0]); + } + + for (int64_t m = 0; m < M; m++) { + for (int64_t n = 0; n < N; n++) { + float sum = 0.0f; + for (int64_t k = 0; k < K; k++) { + float A_dequantized = (static_cast(QuantA_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0]; + + float B_dequantized = has_zp ? + (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : + //(B_data[k * N + n] - B_zero_point[n]) * B_scale[n] : + B_data[k * N + n] * B_scale[n]; + + sum += A_dequantized * B_dequantized; + } + if (has_bias) { + sum += Bias[n]; + } + Y_data[m * N + n] = sum; + } + } +} + + +template +void TestDynamicQuantizeMatMul(bool is_matrix_b_constant, bool per_column = false, bool has_zp = true, bool has_bias = false) { // create rand inputs RandomValueGenerator random{}; + int64_t M = 4; + int64_t N = 128; + int64_t K = 128; + std::vector A_dims{M, K}; + std::vector B_dims{K, N}; + std::vector Y_dims{M, K}; std::vector A_data = random.Uniform(A_dims, -1.0f, 1.0f); - + //std::vector A_data = random.Uniform(A_dims, -10.0f, 10.0f); std::vector B_data; - std::vector tmp_B_data = random.Uniform(B_dims, std::numeric_limits::min(), std::numeric_limits::max()); + std::vector tmp_B_data = random.Uniform(B_dims, + (constexpr(std::is_same_v)) ? + std::numeric_limits::lowest() / 2 : std::numeric_limits::lowest(), + std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> T { return static_cast(v); }); @@ -47,7 +124,8 @@ void TestDynamicQuantizeMatMul(const std::vector& A_dims, std::for_each(B_zero_point.begin(), B_zero_point.end(), [&random](T& zp) { - zp = static_cast(random.Uniform(std::array{1}, std::numeric_limits::min(), std::numeric_limits::max())[0]); + zp = static_cast(random.Uniform(std::array{1}, + std::numeric_limits::min(), std::numeric_limits::max())[0]); }); std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f); @@ -69,79 +147,100 @@ void TestDynamicQuantizeMatMul(const std::vector& A_dims, test.AddOptionalInputEdge(); } - test.AddReferenceOutputs(reference_model); - test.Run(); + std::vector Y_data(M * N); + CalculateDynamicQuantizeMatMul(M, N, K, A_data, B_data, B_scale, B_zero_point, Bias, Y_data, per_column, has_zp, has_bias); + test.AddOutput("Y", {M, N}, Y_data); + // Only DML EP supports these data type combinations for now + //if (constexpr(std::is_same_v) && + // constexpr(std::is_same_v))) { + // std::vector> execution_providers; + // execution_providers.push_back(DefaultDmlExecutionProvider()); + // test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + //} else { + test.SetOutputRelErr("Y", 0.5f); + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(); } -template -void RunDynamicQuantizeMatMulTest(const string& model_path) { - std::vector A_dims{4, 128}; - std::vector B_dims{128, 128}; - std::vector Y_dims{4, 128}; - - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ - ); +template +void RunDynamicQuantizeMatMulTest() { - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestDynamicQuantizeMatMul(false, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestDynamicQuantizeMatMul(true, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + + TestDynamicQuantizeMatMul(false, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); + + TestDynamicQuantizeMatMul(true, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ + ); +} + +TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_S8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_U8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_S8) { + RunDynamicQuantizeMatMulTest(); } -TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test) { - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_int8.onnx"); - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_uint8.onnx"); +TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_U8) { + RunDynamicQuantizeMatMulTest(); } -TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test) { - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_int8_bias.onnx"); - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_uint8_bias.onnx"); + +TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_S8) { + RunDynamicQuantizeMatMulTest(); } -TEST(DynamicQuantizeMatMul, UInt8_test_with_empty_input) { - std::vector A_dims{0, 128}; - std::vector B_dims{128, 128}; - std::vector Y_dims{0, 128}; +TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_U8) { + RunDynamicQuantizeMatMulTest(); +} - TestDynamicQuantizeMatMul(A_dims, - B_dims, - "testdata/dynamic_quantize_matmul_uint8.onnx", - false /*is_matrix_b_constant*/); +TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_S8) { + RunDynamicQuantizeMatMulTest(); +} - TestDynamicQuantizeMatMul(A_dims, - B_dims, - "testdata/dynamic_quantize_matmul_uint8.onnx", - true /*is_matrix_b_constant*/); +TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_U8) { + RunDynamicQuantizeMatMulTest(); } + +//TEST(DynamicQuantizeMatMul, UInt8_test_with_empty_input) { +// std::vector A_dims{0, 128}; +// std::vector B_dims{128, 128}; +// std::vector Y_dims{0, 128}; +// +// TestDynamicQuantizeMatMul( +// +// , +// false /*is_matrix_b_constant*/); +// +// TestDynamicQuantizeMatMul( +// +// , +// true /*is_matrix_b_constant*/); +//} + TEST(DynamicQuantizeMatMul, B_PerColumn_ND) { auto test_case = [&](const std::vector& input_shape, const std::vector& weights_shape, From d92e58d607ffd36fcd46a12e2661772c1e1673ea Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Tue, 13 Feb 2024 21:58:26 -0800 Subject: [PATCH 2/2] UInt8_test_with_empty_input test updates --- .../dynamic_quantize_matmul_test.cc | 96 ++++++++----------- 1 file changed, 38 insertions(+), 58 deletions(-) diff --git a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc index d4aa731d32631..c24ce078da978 100644 --- a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc +++ b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc @@ -24,47 +24,40 @@ namespace test { template static void CalculateDynamicQuantizeMatMul(const int64_t M, const int64_t N, const int64_t K, - const std::vector& A_data, const std::vector& B_data, - std::vector& B_scale, std::vector& B_zero_point, - const std::vector& Bias, std::vector& Y_data, - bool per_column, bool has_zp, bool has_bias) { + const std::vector& A_data, const std::vector& B_data, + std::vector& B_scale, std::vector& B_zero_point, + const std::vector& Bias, std::vector& Y_data, + bool per_column, bool has_zp, bool has_bias) { // DynamicQuantize Matrix A const uint32_t num_elements = M * K; std::vector QuantA_data(num_elements); std::vector A_scale; std::vector A_zero_point; - // // Get max and min - // - float min = std::numeric_limits::max(); float max = std::numeric_limits::lowest(); float qmax = static_cast(std::numeric_limits::max()); float qmin = static_cast(std::numeric_limits::lowest()); - for (uint32_t i = 0; i < num_elements; ++i) { max = std::max(A_data[i], max); min = std::min(A_data[i], min); } - // // Adjust the maximum and minimum to include zero - // max = std::max(max, 0.0f); min = std::min(min, 0.0f); - // - // Compute quantization values and store in tensors - // float scale = static_cast(max - min) / (qmax - qmin); T zeroPoint = std::round(std::clamp(qmin - min / scale, qmin, qmax)); A_scale.push_back(scale); A_zero_point.push_back(zeroPoint); + + // Matrix Multiplication for (uint32_t i = 0; i < num_elements; ++i) { - QuantA_data[i] = static_cast(std::round((A_data[i] / scale ) + zeroPoint)); + QuantA_data[i] = static_cast(std::round((A_data[i] / scale) + zeroPoint)); } if (!per_column) { B_zero_point.resize(N, B_zero_point[0]); @@ -75,11 +68,11 @@ static void CalculateDynamicQuantizeMatMul(const int64_t M, const int64_t N, con for (int64_t n = 0; n < N; n++) { float sum = 0.0f; for (int64_t k = 0; k < K; k++) { - float A_dequantized = (static_cast(QuantA_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0]; + float A_dequantized = (static_cast(QuantA_data[m * K + k]) - static_cast(A_zero_point[0])) + * A_scale[0]; - float B_dequantized = has_zp ? + float B_dequantized = has_zp ? (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : - //(B_data[k * N + n] - B_zero_point[n]) * B_scale[n] : B_data[k * N + n] * B_scale[n]; sum += A_dequantized * B_dequantized; @@ -92,27 +85,27 @@ static void CalculateDynamicQuantizeMatMul(const int64_t M, const int64_t N, con } } - template void TestDynamicQuantizeMatMul(bool is_matrix_b_constant, bool per_column = false, bool has_zp = true, - bool has_bias = false) { + bool has_bias = false, + bool empty_input = false) { // create rand inputs RandomValueGenerator random{}; - int64_t M = 4; + int64_t M = empty_input ? 1 : 4; int64_t N = 128; int64_t K = 128; - std::vector A_dims{M, K}; + std::vector A_dims{empty_input ? 0 : M, K}; std::vector B_dims{K, N}; - std::vector Y_dims{M, K}; + std::vector Y_dims{empty_input ? 0 : M, K}; std::vector A_data = random.Uniform(A_dims, -1.0f, 1.0f); - //std::vector A_data = random.Uniform(A_dims, -10.0f, 10.0f); std::vector B_data; std::vector tmp_B_data = random.Uniform(B_dims, (constexpr(std::is_same_v)) ? - std::numeric_limits::lowest() / 2 : std::numeric_limits::lowest(), + std::numeric_limits::lowest() / 2 : + std::numeric_limits::lowest(), std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> T { return static_cast(v); @@ -125,7 +118,8 @@ void TestDynamicQuantizeMatMul(bool is_matrix_b_constant, B_zero_point.end(), [&random](T& zp) { zp = static_cast(random.Uniform(std::array{1}, - std::numeric_limits::min(), std::numeric_limits::max())[0]); + std::numeric_limits::min(), + std::numeric_limits::max())[0]); }); std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f); @@ -148,24 +142,14 @@ void TestDynamicQuantizeMatMul(bool is_matrix_b_constant, } std::vector Y_data(M * N); - CalculateDynamicQuantizeMatMul(M, N, K, A_data, B_data, B_scale, B_zero_point, Bias, Y_data, per_column, has_zp, has_bias); - test.AddOutput("Y", {M, N}, Y_data); - // Only DML EP supports these data type combinations for now - //if (constexpr(std::is_same_v) && - // constexpr(std::is_same_v))) { - // std::vector> execution_providers; - // execution_providers.push_back(DefaultDmlExecutionProvider()); - // test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); - //} else { - test.SetOutputRelErr("Y", 0.5f); - std::vector> execution_providers; - execution_providers.push_back(DefaultDmlExecutionProvider()); - test.Run(); + CalculateDynamicQuantizeMatMul(M, N, K, A_data, B_data, B_scale, B_zero_point, Bias, Y_data, + per_column, has_zp, has_bias); + test.AddOutput("Y", Y_dims, Y_data); + test.Run(); } template void RunDynamicQuantizeMatMulTest() { - TestDynamicQuantizeMatMul(false, /*is_matrix_b_constant*/ false, /*per_column*/ HasZeroPoint, /*has_zp*/ @@ -177,13 +161,13 @@ void RunDynamicQuantizeMatMulTest() { HasZeroPoint, /*has_zp*/ HasBias /*has_bias*/ ); - + TestDynamicQuantizeMatMul(false, /*is_matrix_b_constant*/ true, /*per_column*/ HasZeroPoint, /*has_zp*/ HasBias /*has_bias*/ ); - + TestDynamicQuantizeMatMul(true, /*is_matrix_b_constant*/ true, /*per_column*/ HasZeroPoint, /*has_zp*/ @@ -207,7 +191,6 @@ TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_U8) { RunDynamicQuantizeMatMulTest(); } - TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_S8) { RunDynamicQuantizeMatMulTest(); } @@ -224,22 +207,19 @@ TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_U8) { RunDynamicQuantizeMatMulTest(); } - -//TEST(DynamicQuantizeMatMul, UInt8_test_with_empty_input) { -// std::vector A_dims{0, 128}; -// std::vector B_dims{128, 128}; -// std::vector Y_dims{0, 128}; -// -// TestDynamicQuantizeMatMul( -// -// , -// false /*is_matrix_b_constant*/); -// -// TestDynamicQuantizeMatMul( -// -// , -// true /*is_matrix_b_constant*/); -//} +TEST(DynamicQuantizeMatMul, UInt8_test_with_empty_input) { + std::vector A_dims{0, 2}; + std::vector B_dims{2, 2}; + std::vector Y_dims{0, 2}; + OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain); + test.AddInput("T1", A_dims, {}); + test.AddInput("T2", B_dims, {1, 6, 0, 8}); + test.AddInput("b_scale", {1}, {1.0f}); + test.AddInput("b_zero_point", {1}, {0}); + test.AddOptionalInputEdge(); + test.AddOutput("Y", {0, 2}, {}); + test.Run(); +} TEST(DynamicQuantizeMatMul, B_PerColumn_ND) { auto test_case = [&](const std::vector& input_shape,