Fix untyped float values in quantization tool missing from PR #18043 (#19182)

### Description
Extends code coverage to the Entropy, Histogram, and Distribution calibration methods, and fixes the bugs found while doing so.



### Motivation and Context
Bugs detected in [Olive](https://github.com/microsoft/OLive).
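
For context, the calibration methods exercised here are selected through the `calibrate_method` argument of `quantize_static`. A minimal sketch of that entry point (model paths, input name, and quantization settings are illustrative, not taken from this commit):

```python
import numpy as np

from onnxruntime.quantization import (
    CalibrationDataReader,
    CalibrationMethod,
    QuantFormat,
    QuantType,
    quantize_static,
)


class RandomDataReader(CalibrationDataReader):
    """Feeds a few random batches to the calibrator (illustrative only)."""

    def __init__(self, input_name="input", shape=(1, 4), n=8):
        self._feeds = iter(
            [{input_name: np.random.rand(*shape).astype(np.float32)} for _ in range(n)]
        )

    def get_next(self):
        return next(self._feeds, None)


quantize_static(
    "matmul_fp.onnx",        # float model to quantize (illustrative path)
    "matmul_int8.onnx",      # quantized output (illustrative path)
    RandomDataReader(),
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
    calibrate_method=CalibrationMethod.Entropy,  # also MinMax, Percentile, Distribution
    extra_options={"ActivationSymmetric": True},
)
```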
xadupre authored Jan 17, 2024
1 parent 9876cc7 commit 63dd605
Showing 3 changed files with 131 additions and 23 deletions.
86 changes: 67 additions & 19 deletions onnxruntime/python/tools/quantization/calibrate.py
@@ -5,6 +5,7 @@
# license information.
# --------------------------------------------------------------------------
import abc
import copy
import itertools
import os
import uuid
@@ -21,6 +22,48 @@
from .quant_utils import apply_plot, load_model_with_shape_infer, smooth_distribution


def rel_entr(pk: np.ndarray, qk: np.ndarray) -> np.ndarray:
"""
See https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html#scipy.special.rel_entr.
Python implementation.
"""
res = np.empty(pk.shape, dtype=pk.dtype)
res[:] = pk[:] * np.log(pk[:] / qk[:])
c1 = (pk > 0) & (qk > 0)
res[~c1] = np.inf  # rel_entr is +inf outside the jointly positive support
c2 = (pk == 0) & (qk >= 0)
res[c2] = 0  # by convention rel_entr(0, q) is 0 for q >= 0, so it must override the inf above
return res


def entropy(
pk: np.ndarray,
qk: np.ndarray,
base: Optional[float] = None,
axis: int = 0,
) -> np.ndarray:
"""
Simplified version of entropy.
Source: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html.
This avoids taking a dependency on scipy just for this function.
"""
assert base is None or base > 0, f"base={base} must be a positive number or `None`."
assert qk is not None, "qk is None"

pk = np.asarray(pk).astype(np.float32)
pk = 1.0 * pk / np.sum(pk, axis=axis, keepdims=True)

qk = np.asarray(qk).astype(np.float32)
pk, qk = np.broadcast_arrays(pk, qk)
qk = 1.0 * qk / np.sum(qk, axis=axis, keepdims=True)
vec = rel_entr(pk, qk)

s = np.sum(vec, axis=axis)
if base is not None:
s /= np.log(base)
return s.astype(pk.dtype)
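
As an editorial aside (not part of the diff): the new `entropy` helper is intended as a drop-in replacement for the two-argument form of `scipy.stats.entropy` (i.e. the KL divergence), which can be checked directly when scipy happens to be installed — essentially what the new `test_entropy` added below does:

```python
import numpy as np
from scipy.stats import entropy as scipy_entropy  # only needed for this comparison

from onnxruntime.quantization.calibrate import entropy

pk = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32)
qk = np.array([0.25, 0.25, 0.25, 0.25], dtype=np.float32)

# Both normalize their inputs and return sum(p * log(p / q)) in nats.
np.testing.assert_almost_equal(scipy_entropy(pk, qk), entropy(pk, qk), decimal=5)
```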


class TensorData:
_allowed = frozenset(["avg", "std", "lowest", "highest", "hist", "hist_edges", "bins"])
_floats = frozenset(["avg", "std", "lowest", "highest", "hist_edges"])
@@ -708,8 +751,8 @@ def collect_absolute_value(self, name_to_arr):
min_value = np.min(data_arr_np)
max_value = np.max(data_arr_np)
else:
min_value = 0
max_value = 0
min_value = np.array(0, dtype=data_arr_np.dtype)
max_value = np.array(0, dtype=data_arr_np.dtype)

data_arr_np = np.absolute(data_arr_np) # only consider absolute value

@@ -725,6 +768,8 @@ def collect_absolute_value(self, name_to_arr):
old_histogram = self.histogram_dict[tensor]
old_min = old_histogram[2]
old_max = old_histogram[3]
assert hasattr(old_min, "dtype"), f"old_min should be a numpy array but is {type(old_min)}"
assert hasattr(old_max, "dtype"), f"old_max should be a numpy array but is {type(old_max)}"
old_hist = old_histogram[0]
old_hist_edges = old_histogram[1]
temp_amax = np.max(data_arr_np)
@@ -757,7 +802,7 @@ def collect_value(self, name_to_arr):
min_value = np.array(0, dtype=data_arr.dtype)
max_value = np.array(0, dtype=data_arr.dtype)

threshold = max(abs(min_value), abs(max_value))
threshold = np.array(max(abs(min_value), abs(max_value)), dtype=data_arr.dtype)

if tensor in self.histogram_dict:
old_histogram = self.histogram_dict[tensor]
@@ -809,7 +854,7 @@ def merge_histogram(self, old_histogram, data_arr, new_min, new_max, new_thresho
def compute_collection_result(self):
if not self.histogram_dict or len(self.histogram_dict) == 0:
raise ValueError("Histogram has not been collected. Please run collect() first.")
print(f"Finding optimal threshold for each tensor using {self.method} algorithm ...")
print(f"Finding optimal threshold for each tensor using {self.method!r} algorithm ...")

if self.method == "entropy":
return self.compute_entropy()
@@ -938,7 +983,14 @@ def compute_distribution(self):
assert avg_coef.dtype != np.float64
assert std_coef.dtype != np.float64
assert hist_edges.dtype != np.float64
thresholds_dict[tensor] = TensorData(avg=avg_coef, std=std_coef, hist=hist, hist_edges=hist_edges)
thresholds_dict[tensor] = TensorData(
avg=avg_coef,
std=std_coef,
hist=hist,
hist_edges=hist_edges,
lowest=hist_edges.min(),
highest=hist_edges.max(),
)

# Plot histogram for debug only
if os.environ.get("QUANTIZATION_DEBUG", 0) in (1, "1"):
@@ -952,18 +1004,15 @@ def get_entropy_threshold(self, histogram, num_quantized_bins):
`q` is a truncated version of the original distribution.
Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
"""
import copy

from scipy.stats import entropy

hist = histogram[0]
hist_edges = histogram[1]
num_bins = hist.size
zero_bin_index = num_bins // 2
num_half_quantized_bin = num_quantized_bins // 2

dtype = histogram[1].dtype
kl_divergence = np.zeros(zero_bin_index - num_half_quantized_bin + 1)
thresholds = [(0, 0) for i in range(kl_divergence.size)]
thresholds = [(np.array(0, dtype=dtype), np.array(0, dtype=dtype)) for i in range(kl_divergence.size)]

# <------------ num bins ---------------->
# <--- quantized bins ---->
@@ -983,10 +1032,7 @@ def get_entropy_threshold(self, histogram, num_quantized_bins):
start_index = zero_bin_index - i
end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins

thresholds[i - num_half_quantized_bin] = (
float(hist_edges[start_index]),
float(hist_edges[end_index]),
)
thresholds[i - num_half_quantized_bin] = (hist_edges[start_index], hist_edges[end_index])

sliced_distribution = copy.deepcopy(hist[start_index:end_index])

@@ -1020,15 +1066,15 @@ def get_entropy_threshold(self, histogram, num_quantized_bins):

norm = sum(nonzeros[start:end])
if norm != 0:
q[start:end] = float(quantized_bins[index]) / float(norm)
q[start:end] = quantized_bins[index] / norm

p = smooth_distribution(p)
q = smooth_distribution(q)

if isinstance(q, np.ndarray):
kl_divergence[i - num_half_quantized_bin] = entropy(p, q)
if p is None or q is None:
div = np.array(np.inf, dtype=dtype)
else:
kl_divergence[i - num_half_quantized_bin] = float("inf")
div = np.array(entropy(p, q), dtype=dtype)
kl_divergence[i - num_half_quantized_bin] = div

min_kl_divergence_idx = np.argmin(kl_divergence)
optimal_threshold = thresholds[min_kl_divergence_idx]
@@ -1038,6 +1084,8 @@ def get_entropy_threshold(self, histogram, num_quantized_bins):
optimal_threshold = (min_value, optimal_threshold[1])
if optimal_threshold[1] > max_value:
optimal_threshold = (optimal_threshold[0], max_value)
assert hasattr(optimal_threshold[0], "dtype")
assert hasattr(optimal_threshold[1], "dtype")
return optimal_threshold


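For reference (editorial note, not part of the diff): the threshold search in `get_entropy_threshold` follows the TensorRT 8-bit calibration recipe cited in its docstring. Each candidate clipping range yields a reference distribution P (the clipped histogram) and a coarse, re-expanded distribution Q over `num_quantized_bins` bins; after smoothing, the range minimizing their relative entropy is kept, i.e. with the helpers above

$$D_{KL}(P \,\|\, Q) = \sum_i \operatorname{rel\_entr}(p_i, q_i) = \sum_{i:\,p_i>0,\,q_i>0} p_i \log\frac{p_i}{q_i},$$

where $p_i$ and $q_i$ are the normalized bin probabilities.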
2 changes: 1 addition & 1 deletion onnxruntime/python/tools/quantization/quant_utils.py
@@ -653,7 +653,7 @@ def smooth_distribution(p, eps=0.0001):

if not n_nonzeros:
# raise ValueError('The discrete probability distribution is malformed. All entries are 0.')
return -1
return None
eps1 = eps * float(n_zeros) / float(n_nonzeros)
assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % (
n_zeros,
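
One practical consequence of the `smooth_distribution` change above: a degenerate (all-zero) histogram slice is now reported as `None` instead of `-1`, and the entropy calibrator guards for it before computing the divergence. A small sketch of the caller-side pattern (input values are made up; assumes an onnxruntime build that includes this change):

```python
import numpy as np

from onnxruntime.quantization.calibrate import entropy
from onnxruntime.quantization.quant_utils import smooth_distribution

p = smooth_distribution(np.array([0.0, 3.0, 5.0, 2.0], dtype=np.float32))
q = smooth_distribution(np.array([0.0, 0.0, 0.0, 0.0], dtype=np.float32))  # all zeros -> None

# Mirror the guard used in get_entropy_threshold: a degenerate slice has no valid divergence.
kl = np.inf if p is None or q is None else entropy(p, q)
print(kl)  # inf
```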
66 changes: 63 additions & 3 deletions onnxruntime/test/python/quantization/test_op_matmul.py
@@ -10,13 +10,39 @@
import numpy as np
import onnx
import packaging.version as pv
from numpy.testing import assert_almost_equal
from onnx import TensorProto, helper
from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type

from onnxruntime.capi.onnxruntime_pybind11_state import Fail
from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType, quantize_dynamic, quantize_static
from onnxruntime.quantization.calibrate import entropy


def skip_if_new_opset_exception_raised(func):
def wrapper(*args, **kwargs):
try:
func(*args, **kwargs)
except Fail as e:
if "is under development and support for this is limited" in str(e):
raise unittest.SkipTest(f"Skipped {func} due to opset under development.") # noqa: B904
raise

return wrapper


class TestOpMatMul(unittest.TestCase):
def test_entropy(self):
try:
from scipy.stats import entropy as scipy_entropy
except ImportError:
raise unittest.SkipTest("scipy not installed.") # noqa: B904
pk = (np.arange(10) - 5).astype(np.float32) / 10
qk = -(np.arange(10) - 5).astype(np.float32) / 10
ent = scipy_entropy(pk, qk)
get = entropy(pk, qk)
assert_almost_equal(ent, get)

def input_feeds(self, n, name2shape, dtype):
input_data_list = []
for _i in range(n):
@@ -324,10 +350,11 @@ def test_quantize_matmul_u8u8(self):
@unittest.skipIf(
pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709"
)
@skip_if_new_opset_exception_raised
def test_quantize_matmul_u8u8_f16(self):
self.quantize_matmul_u8u8(onnx.TensorProto.FLOAT16, 19, 9)
self.quantize_matmul_u8u8(onnx.TensorProto.FLOAT16, 21, 9)

def quantize_matmul_s8s8(self, tt, opset, ir_version):
def quantize_matmul_s8s8(self, tt, opset, ir_version, calibrate_method=CalibrationMethod.MinMax):
np.random.seed(1)
model_fp_path = "matmul_fp.onnx"
self.construct_model_matmul(model_fp_path, tensor_type=tt, opset=opset, ir_version=ir_version)
@@ -341,13 +368,15 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version):
activation_type=QuantType.QInt8,
weight_type=QuantType.QInt8,
extra_options={"ActivationSymmetric": True},
calibrate_method=calibrate_method,
)
self.static_quant_test_qdq(
model_fp_path,
data_reader,
activation_type=QuantType.QInt8,
weight_type=QuantType.QInt8,
extra_options={"ActivationSymmetric": True},
calibrate_method=calibrate_method,
)

# dynamic quantization doesn't support activation:int8
@@ -357,11 +386,42 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version):
def test_quantize_matmul_s8s8(self):
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8)

def test_quantize_matmul_s8s8_entropy(self):
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Entropy)

def test_quantize_matmul_s8s8_percentile(self):
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Percentile)

def test_quantize_matmul_s8s8_distribution(self):
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Distribution)

@unittest.skipIf(
pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709"
)
@skip_if_new_opset_exception_raised
def test_quantize_matmul_s8s8_f16(self):
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 19, 9)
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9)

@unittest.skipIf(
pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709"
)
@skip_if_new_opset_exception_raised
def test_quantize_matmul_s8s8_f16_entropy(self):
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Entropy)

@unittest.skipIf(
pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709"
)
@skip_if_new_opset_exception_raised
def test_quantize_matmul_s8s8_f16_percentile(self):
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Percentile)

@unittest.skipIf(
pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709"
)
@skip_if_new_opset_exception_raised
def test_quantize_matmul_s8s8_f16_distribution(self):
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Distribution)

def quantize_matmul_e4m3fn_same(self, tt, opset, ir_version):
np.random.seed(1)
