[QNN EP/Quantization] Add MinimumRealRange extra option to quantization script #18278

Merged · 15 commits · Nov 9, 2023
7 changes: 6 additions & 1 deletion onnxruntime/python/tools/quantization/onnx_quantizer.py
@@ -111,6 +111,7 @@ def __init__(
self.is_activation_symmetric = (
False if "ActivationSymmetric" not in self.extra_options else self.extra_options["ActivationSymmetric"]
)
self.min_real_range = self.extra_options.get("MinimumRealRange")

self.activation_qType = getattr(activation_qType, "tensor_type", activation_qType)
self.weight_qType = getattr(weight_qType, "tensor_type", weight_qType)
@@ -998,6 +999,7 @@ def quantize_initializer(self, weight, qType, reduce_range=False, keep_float_wei
qType,
self.is_weight_symmetric,
self.reduce_range and reduce_range,
self.min_real_range,
)

if qType in {
@@ -1087,6 +1089,7 @@ def quantize_weight_per_channel(
self.is_weight_symmetric
or weight_qType in (onnx_proto.TensorProto.INT8, onnx_proto.TensorProto.FLOAT8E4M3FN),
self.reduce_range and reduce_range,
self.min_real_range,
)
rmin_list.append(rmin)
rmax_list.append(rmax)
@@ -1208,7 +1211,9 @@ def calculate_quantization_params(self):
rmin, rmax = td.range_value
qmin, qmax = get_qmin_qmax_for_qType(self.activation_qType, symmetric=self.is_activation_symmetric)

zero, scale = compute_scale_zp(rmin, rmax, qmin, qmax, self.is_activation_symmetric)
zero, scale = compute_scale_zp(
rmin, rmax, qmin, qmax, self.is_activation_symmetric, self.min_real_range
)
quantization_params[tensor_name] = QuantizationParams(zero_point=zero, scale=scale)

return quantization_params
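
For orientation (not part of the diff): the option is read with dict.get, so an extra_options dict without a MinimumRealRange entry leaves self.min_real_range as None and the quantizer behaves exactly as before. A minimal sketch, using hypothetical option values:

# Sketch only: how the quantizer picks up the new option (names mirror the diff above).
extra_options = {"ActivationSymmetric": False}            # MinimumRealRange omitted
min_real_range = extra_options.get("MinimumRealRange")    # -> None, clamp is disabled

extra_options = {"MinimumRealRange": 0.0001}               # hypothetical user-supplied value
min_real_range = extra_options.get("MinimumRealRange")    # -> 0.0001, forwarded to compute_scale_zp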
14 changes: 11 additions & 3 deletions onnxruntime/python/tools/quantization/quant_utils.py
@@ -184,7 +184,7 @@ def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None):
return arr_fp32.astype(dtype)


def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False):
def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False, min_real_range=None):
"""Calculate the scale s and zero point z for the quantization relation
r = s(q-z), where r are the original values and q are the corresponding
quantized values.
@@ -199,6 +199,8 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False):
:parameter rmax: maximum value of r
:parameter qmin: minimum value representable by the target quantization data type
:parameter qmax: maximum value representable by the target quantization data type
:parameter symmetric: True if the floating-point range should be made symmetric. Defaults to False.
:parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None.
:return: zero and scale [z, s]

"""
@@ -211,6 +213,10 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False):
rmin = min(rmin, 0)
rmax = max(rmax, 0)

# Ensure a minimum floating-point range if specified.
if min_real_range is not None:
rmax = max(rmax, rmin + min_real_range)

if symmetric:
absmax = max(abs(rmin), abs(rmax))
rmin = -absmax
@@ -254,11 +260,13 @@ def compute_scale_zp_float8(element_type, std):
return [zero, scale]


def quantize_data(data, qType, symmetric, reduce_range=False):
def quantize_data(data, qType, symmetric, reduce_range=False, min_real_range=None):
"""
:param data: data to quantize
:param qType: data type to quantize to. Supported types UINT8 and INT8
:param symmetric: whether symmetric quantization is used or not. This is applied to INT8.
:param reduce_range: True if the quantization range should be reduced. Defaults to False.
:param min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None.
:return: minimum, maximum, zero point, scale, and quantized weights

To pack weights, we compute a linear transformation
@@ -301,7 +309,7 @@ def quantize_data(data, qType, symmetric, reduce_range=False):
if qType in (TensorProto.INT8, TensorProto.UINT8, TensorProto.INT16, TensorProto.UINT16):
if len(data):
qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric)
zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, min_real_range)
quantized_data = quantize_nparray(qType, numpy.asarray(data), scale, zero_point)
return rmin, rmax, zero_point, scale, quantized_data

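To make the effect of the new argument concrete, here is a small worked sketch (not the library function itself) that reproduces the clamp and the scales asserted in the tests later in this PR, assuming the calibrated range has collapsed to rmin = rmax = 0:

# Sketch: effect of min_real_range when the calibrated range collapses to a point.
rmin, rmax, min_real_range = 0.0, 0.0, 0.0001
rmax = max(rmax, rmin + min_real_range)         # rmax is pushed up to 0.0001

# Asymmetric uint8 (qmin=0, qmax=255):
scale_u8 = (rmax - rmin) / (255 - 0)            # 0.0001 / 255

# Symmetric int8 (qmin=-128, qmax=127): the range is mirrored around zero first,
# so its width doubles before dividing by the same 255-step quantized span.
absmax = max(abs(rmin), abs(rmax))
scale_i8 = (2 * absmax) / (127 - (-128))        # 0.0002 / 255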
6 changes: 6 additions & 0 deletions onnxruntime/python/tools/quantization/quantize.py
@@ -370,6 +370,12 @@ def quantize_static(
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
contrib op implementations. The contrib op implementations may support features not standardized
into the ONNX specification (e.g., 16-bit quantization types).
MinimumRealRange = float|None :
Default is None. If set to a floating-point value, the calculation of the quantization parameters
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
necessary for EPs like QNN that require a minimum floating-point range when determining
quantization parameters.
"""
if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
if calibrate_method != CalibrationMethod.Distribution:
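For reference, enabling the option from user code is a one-line addition to the extra_options dict passed to quantize_static; the snippet below mirrors the call in the new unit test (the model file names are placeholders, and data_reader stands for any CalibrationDataReader implementation):

from onnxruntime import quantization

quantization.quantize_static(
    model_input="model.onnx",                      # placeholder: float32 model to quantize
    model_output="model_qdq_u8.onnx",              # placeholder: quantized QDQ output path
    calibration_data_reader=data_reader,           # any CalibrationDataReader implementation
    quant_format=quantization.QuantFormat.QDQ,
    activation_type=quantization.QuantType.QUInt8,
    weight_type=quantization.QuantType.QUInt8,
    extra_options={"MinimumRealRange": 0.0001},    # enforce rmax - rmin >= 0.0001
)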
106 changes: 106 additions & 0 deletions onnxruntime/test/python/quantization/test_minimum_real_range_option.py
@@ -0,0 +1,106 @@
#!/usr/bin/env python
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

import unittest

import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

from onnxruntime import quantization


class TestMinimumRealRangeOption(unittest.TestCase):
def setUp(self):
self.qdq_model_name = "model_qdq_u8.onnx"

# Set up activations/weights with zero value ranges (i.e., rmax - rmin == 0).
self.zero_range_activations = [
np.zeros([1, 2, 32, 32], dtype="float32"),
]

self.zero_range_weights = np.zeros([1, 2, 2, 2], dtype="float32")

def perform_quantization(self, activations, weight, min_real_range):
# One-layer convolution model to be quantized with uint8 activations and uint8 weights.
act = helper.make_tensor_value_info("ACT", TensorProto.FLOAT, activations[0].shape)
helper.make_tensor_value_info("WGT", TensorProto.FLOAT, weight.shape)
res = helper.make_tensor_value_info("RES", TensorProto.FLOAT, [None, None, None, None])
wgt_init = numpy_helper.from_array(weight, "WGT")
conv_node = onnx.helper.make_node("Conv", ["ACT", "WGT"], ["RES"])
graph = helper.make_graph([conv_node], "test", [act], [res], initializer=[wgt_init])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 11)])
onnx.save(model, "model.onnx")

# Quantize model
class DummyDataReader(quantization.CalibrationDataReader):
def __init__(self):
self.iterator = ({"ACT": act} for act in activations)

def get_next(self):
return next(self.iterator, None)

quantization.quantize_static(
model_input="model.onnx",
model_output=self.qdq_model_name,
calibration_data_reader=DummyDataReader(),
quant_format=quantization.QuantFormat.QDQ,
activation_type=quantization.QuantType.QUInt8,
weight_type=quantization.QuantType.QUInt8,
op_types_to_quantize=["Conv"],
extra_options={"MinimumRealRange": min_real_range},
)

# Extract quantization parameters: scales and zero points for activations and weights.
model = onnx.load(self.qdq_model_name)
act_zp = next(init for init in model.graph.initializer if init.name == "ACT_zero_point").int32_data[0]
act_sc = next(init for init in model.graph.initializer if init.name == "ACT_scale").float_data[0]
wgt_zp = next(init for init in model.graph.initializer if init.name == "WGT_zero_point").int32_data[0]
wgt_sc = next(init for init in model.graph.initializer if init.name == "WGT_scale").float_data[0]

# Return quantization parameters
return act_zp, act_sc, wgt_zp, wgt_sc

def test_default(self):
"""
Test default behavior without specifying the MinimumRealRange option.
"""
act_zp, act_sc, wgt_zp, wgt_sc = self.perform_quantization(
self.zero_range_activations,
self.zero_range_weights,
min_real_range=None, # default behavior
)

# No minimum real range is set. Expect default behavior (scale = 1.0, zp = 0)
self.assertEqual(act_zp, 0)
self.assertEqual(act_sc, 1.0)
self.assertEqual(wgt_zp, 0)
self.assertEqual(wgt_sc, 1.0)

def test_min_real_range(self):
"""
Test a MinimumRealRange value of 0.0001.
"""
min_real_range = 0.0001

act_zp, act_sc, wgt_zp, wgt_sc = self.perform_quantization(
self.zero_range_activations,
self.zero_range_weights,
min_real_range=min_real_range,
)

expected_scale = np.float32(min_real_range / 255)

# Minimum floating-point range is set. Expect small scale values.
self.assertEqual(act_zp, 0)
self.assertEqual(act_sc, expected_scale)
self.assertEqual(wgt_zp, 0)
self.assertEqual(wgt_sc, expected_scale)


if __name__ == "__main__":
unittest.main()
12 changes: 12 additions & 0 deletions onnxruntime/test/python/quantization/test_quant_util.py
@@ -33,6 +33,18 @@ def test_compute_scale_zp(self):
self.assertEqual(compute_scale_zp(-tiny_float, tiny_float, 0, 255, symmetric=True), [0, 1.0])
self.assertEqual(compute_scale_zp(-tiny_float, 0.0, 0, 255, symmetric=False), [0, 1.0])

# Test enforcing a minimum floating-point range.
self.assertEqual(compute_scale_zp(0.0, 0.0, 0, 255, symmetric=False, min_real_range=0.0001), [0, 0.0001 / 255])
self.assertEqual(
compute_scale_zp(0.0, 0.0, -128, 127, symmetric=True, min_real_range=0.0001), [0, 0.0002 / 255]
)
self.assertEqual(
compute_scale_zp(0.0, 0.0, 0, 65535, symmetric=False, min_real_range=0.0001), [0, 0.0001 / 65535]
)
self.assertEqual(
compute_scale_zp(0.0, 0.0, -32768, 32767, symmetric=True, min_real_range=0.0001), [0, 0.0002 / 65535]
)

def test_load_external_model(self):
input_name = "input"
output_name = "output"