Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into user/linneamay/resize-18
Browse files Browse the repository at this point in the history
  • Loading branch information
Linnea May committed Jan 8, 2024
2 parents 1408272 + 52e5601 commit 2b2dedb
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 35 deletions.
12 changes: 10 additions & 2 deletions onnxruntime/contrib_ops/rocm/fused_conv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,12 @@ struct FNVHash {
void HashConvolutionDescriptor(miopenConvolutionDescriptor_t cdesc) {
int spatial_dim = 1;
#if ROCM_VERSION >= 50500
miopenGetConvolutionSpatialDim(cdesc, &spatial_dim);
MIOPEN_CALL(miopenGetConvolutionSpatialDim(cdesc, &spatial_dim));
std::vector<int> pads{spatial_dim};
std::vector<int> strides{spatial_dim};
std::vector<int> dilations{spatial_dim};
miopenConvolutionMode_t mode;
MIOPEN_CALL(miopenGetConvolutionNdDescriptor(cdesc, spatial_dim, &spatial_dim, pads.data(), strides.data(), dilations.data(), &mode));
#else
// Previous versions of MIOpen don't provide an API to probe the dimension of a
// miopenConvolutionDescriptor_t, so we have to guess.
Expand All @@ -100,11 +105,12 @@ struct FNVHash {
pads.resize(spatial_dim);
strides.resize(spatial_dim);
dilations.resize(spatial_dim);
#endif
(*this) << spatial_dim;
(*this) << pads;
(*this) << strides;
(*this) << dilations;
#endif
(*this) << mode;
}

private:
Expand Down Expand Up @@ -313,6 +319,8 @@ class FusedConv : public onnxruntime::rocm::Conv<T, false> {
auto ret = miopenCompileFusionPlan(handle, fusion->plan);
if (miopenStatusSuccess == ret) {
fusion->compiled_on.insert(handle);
} else {
return ret;
}
return miopenStatusSuccess;
}
Expand Down
37 changes: 31 additions & 6 deletions onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
# license information.
# --------------------------------------------------------------------------

from __future__ import annotations

import argparse
import logging
import os
from typing import List, Tuple

import numpy as np
import numpy.typing as npt
Expand All @@ -26,16 +27,24 @@
class MatMul4BitsQuantizer:
"""Perform 4b quantization of constant MatMul weights"""

def __init__(self, model: ModelProto, block_size: int, is_symmetric: bool, nodes_to_exclude=None):
def __init__(
self,
model: ModelProto,
block_size: int,
is_symmetric: bool,
accuracy_level: int | None = None,
nodes_to_exclude: list[str] | None = None,
):
if nodes_to_exclude is None:
nodes_to_exclude = []
self.model = ONNXModel(model)
self.block_size = block_size
self.is_symmetric = is_symmetric
self.accuracy_level = accuracy_level
self.nodes_to_exclude = set(nodes_to_exclude)

@staticmethod
def __get_initializer(name, graph_path: List[GraphProto]) -> Tuple[TensorProto, GraphProto]:
def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
for gid in range(len(graph_path) - 1, -1, -1):
graph = graph_path[gid]
for tensor in graph.initializer:
Expand Down Expand Up @@ -66,7 +75,7 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray:

return (packed, scales, zero_point)

def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) -> NodeProto:
def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto:
"""If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node"""

if node.op_type != "MatMul":
Expand Down Expand Up @@ -113,6 +122,8 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto])
kwargs["N"] = cols
kwargs["bits"] = 4
kwargs["block_size"] = self.block_size
if self.accuracy_level is not None:
kwargs["accuracy_level"] = self.accuracy_level

matmul_q4_node = onnx.helper.make_node(
"MatMulNBits",
Expand All @@ -127,7 +138,7 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto])

return matmul_q4_node

def _process_subgraph(self, graph_stack: List[GraphProto]):
def _process_subgraph(self, graph_stack: list[GraphProto]):
new_nodes = []
graph = graph_stack[-1]

Expand Down Expand Up @@ -201,6 +212,14 @@ def parse_args():
type=bool,
help="Indicate whether to quantize the model symmetrically",
)
parser.add_argument(
"--accuracy_level",
required=False,
type=int,
help="Accuracy level of the 4-bit quantized MatMul computation. "
"Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details "
"(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).",
)
parser.add_argument("-v", "--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument(
Expand Down Expand Up @@ -228,6 +247,12 @@ def parse_args():
raise Exception(f"file {output_model_path} already exists")

model = onnx.load(input_model_path)
quant = MatMul4BitsQuantizer(model, args.block_size, args.symmetric, nodes_to_exclude=args.nodes_to_exclude)
quant = MatMul4BitsQuantizer(
model=model,
block_size=args.block_size,
is_symmetric=args.symmetric,
accuracy_level=args.accuracy_level,
nodes_to_exclude=args.nodes_to_exclude,
)
quant.process()
quant.model.save_model_to_file(output_model_path, True)
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from __future__ import annotations

import argparse
import logging
import os
import shutil
from itertools import chain
from typing import List

import onnx
import torch
Expand All @@ -21,11 +22,12 @@
from onnxruntime import quantization as ort_quantization
from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer

torch_export_onnx_opset_version = 14
logger = logging.getLogger("")
init_dist()


def get_model_dynamic_axes(input_names: List[str], output_names: List[str]):
def get_model_dynamic_axes(input_names: list[str], output_names: list[str]):
dynamic_axes = {}
for name in input_names + output_names:
if name in input_names:
Expand All @@ -42,7 +44,7 @@ def get_model_dynamic_axes(input_names: List[str], output_names: List[str]):
return dynamic_axes


def get_model_with_past_kv_dynamic_axes(input_names: List[str], output_names: List[str]):
def get_model_with_past_kv_dynamic_axes(input_names: list[str], output_names: list[str]):
dynamic_axes = {}
for name in input_names + output_names:
if name in {"input_ids", "position_ids"}:
Expand All @@ -65,7 +67,7 @@ def get_model_with_past_kv_dynamic_axes(input_names: List[str], output_names: Li
return dynamic_axes


def get_merged_model_dynamic_axes(input_names: List[str], output_names: List[str]):
def get_merged_model_dynamic_axes(input_names: list[str], output_names: list[str]):
dynamic_axes = {}
for name in input_names + output_names:
if name in {"input_ids", "position_ids"}:
Expand Down Expand Up @@ -229,7 +231,7 @@ def run_torchscript_separate_export(
input_names=input_names,
output_names=output_names,
dynamic_axes=dynamic_axes,
opset_version=13,
opset_version=torch_export_onnx_opset_version,
do_constant_folding=True,
verbose=args.verbose,
)
Expand Down Expand Up @@ -288,7 +290,7 @@ def run_torchscript_separate_export(
input_names=input_names,
output_names=output_names,
dynamic_axes=dynamic_axes,
opset_version=13,
opset_version=torch_export_onnx_opset_version,
do_constant_folding=True,
verbose=args.verbose,
)
Expand Down Expand Up @@ -368,7 +370,7 @@ def run_torchscript_merged_export(
input_names=input_names,
output_names=output_names,
dynamic_axes=dynamic_axes,
opset_version=13,
opset_version=torch_export_onnx_opset_version,
do_constant_folding=True,
verbose=args.verbose,
)
Expand Down Expand Up @@ -412,7 +414,7 @@ def optimize_export(config: AutoConfig, input_path: str, output_path: str, remov


def convert_to_float16(
args: argparse.Namespace, config: AutoConfig, old_paths: List[str], rank: int = 0, world_size: int = 1
args: argparse.Namespace, config: AutoConfig, old_paths: list[str], rank: int = 0, world_size: int = 1
):
decoder_model_fp16_path = os.path.join(args.output, f"rank_{rank}_{args.model_name}_decoder_model_fp16.onnx")
decoder_with_past_model_fp16_path = os.path.join(
Expand Down Expand Up @@ -635,7 +637,7 @@ def get_args():
help="Run a specific quantization algorithm (blockwise for int4, smooth_quant for int8, quantize_dynamic for int8). Blockwise is recommended. Need to install extra packages in `requirements-quant.txt` for SmoothQuant.",
)

blockwise_group = parser.add_argument_group("4-bit quantization")
blockwise_group = parser.add_argument_group("blockwise (4-bit quantization)")

blockwise_group.add_argument(
"--block_size",
Expand All @@ -645,6 +647,15 @@ def get_args():
help="Block size to quantize with. See https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py for details.",
)

blockwise_group.add_argument(
"--int4_accuracy_level",
required=False,
type=int,
help="Accuracy level of the 4-bit quantized MatMul computation. "
"Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details "
"(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).",
)

smooth_quant_group = parser.add_argument_group("smooth_quant (8-bit quantization)")

smooth_quant_group.add_argument(
Expand Down Expand Up @@ -937,7 +948,13 @@ def main():
for fp_path, int4_path in zip(old_paths, new_paths):
if os.path.exists(fp_path):
model = onnx.load_model(fp_path, load_external_data=True)
quant = MatMul4BitsQuantizer(model, args.block_size, is_symmetric=True, nodes_to_exclude=[])
quant = MatMul4BitsQuantizer(
model=model,
block_size=args.block_size,
is_symmetric=True,
accuracy_level=args.int4_accuracy_level,
nodes_to_exclude=[],
)
quant.process()
quant.model.save_model_to_file(int4_path, use_external_data_format=True)
del model
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Tuple
from __future__ import annotations

import numpy as np
import torch
Expand Down Expand Up @@ -235,7 +235,7 @@ def get_past_kv_inputs(config: AutoConfig, batch_size: int, past_seq_len: int, u


# Convert list of past_key_values to dict of past_key and past_value
def flatten_past_kv_inputs(past_key_values: List[Tuple[torch.Tensor, torch.Tensor]]):
def flatten_past_kv_inputs(past_key_values: list[tuple[torch.Tensor, torch.Tensor]]):
past_kv = {}
for i, (past_k, past_v) in enumerate(past_key_values):
past_kv[f"past_key_values.{i}.key"] = past_k.detach().cpu().numpy()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from __future__ import annotations

import argparse
import logging
import os
import time
from typing import List

import numpy as np
import torch
Expand Down Expand Up @@ -139,7 +140,7 @@ def verify_parity(
return kv_cache_ortvalues


def get_args(argv: List[str]):
def get_args(argv: list[str]):
parser = argparse.ArgumentParser()

parser.add_argument(
Expand Down Expand Up @@ -232,7 +233,7 @@ def get_args(argv: List[str]):
return args


def main(argv: List[str] = []): # noqa: B006
def main(argv: list[str] = []): # noqa: B006
args = get_args(argv)
setup_logger(args.verbose)
logger.info(f"Arguments: {args}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,6 @@ parameters:
type: string
default: qnn-v2.17.0.231124_win

- name: ort_package_version
displayName: OnnxRuntime Nuget package version
type: string
default: 1.15.0

- name: build_config
displayName: Build Configuration
type: string
Expand Down Expand Up @@ -47,7 +42,7 @@ jobs:
buildArch: x64
setVcvars: true
ALLOW_RELEASED_ONNX_OPSET_ONLY: '1'
commonBuildArgs: '--compile_no_warning_as_error --disable_ml_ops --build_dir $(Build.BinariesDirectory)\Windows --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}'
commonBuildArgs: '--compile_no_warning_as_error --build_dir $(Build.BinariesDirectory)\Windows --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}'

steps:
- template: templates/set-version-number-variables-step.yml
Expand Down Expand Up @@ -90,7 +85,7 @@ jobs:
displayName: 'Generating nuspec for the native Nuget package x64'
inputs:
script: |
python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version ${{ parameters.ort_package_version }} --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture x64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Windows\packages --ort_build_path $(Build.BinariesDirectory)\Windows --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }}
python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version $(OnnxRuntimeVersion) --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture x64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Windows\packages --ort_build_path $(Build.BinariesDirectory)\Windows --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }}
cd $(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }}
nuget pack NativeNuget.nuspec
mkdir $(Build.ArtifactStagingDirectory)\x64
Expand Down Expand Up @@ -130,7 +125,7 @@ jobs:
displayName: 'Generate CMake Configuration for arm64'
inputs:
scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
arguments: '--update --arm64 --disable_ml_ops --build_dir $(Build.BinariesDirectory)\Win_arm64 --skip_submodule_sync --skip_tests --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}'
arguments: '--update --arm64 --build_dir $(Build.BinariesDirectory)\Win_arm64 --skip_submodule_sync --skip_tests --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}'

- task: VSBuild@1
displayName: 'Build onnxruntime arm64'
Expand Down Expand Up @@ -178,7 +173,7 @@ jobs:
displayName: 'Generating nuspec for the native Nuget package arm64'
inputs:
script: |
python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version ${{ parameters.ort_package_version }} --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture arm64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Win_arm64\packages --ort_build_path $(Build.BinariesDirectory)\Win_arm64 --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }}
python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version $(OnnxRuntimeVersion) --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture arm64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Win_arm64\packages --ort_build_path $(Build.BinariesDirectory)\Win_arm64 --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }}
cd $(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }}
nuget pack NativeNuget.nuspec
mkdir $(Build.ArtifactStagingDirectory)\arm64
Expand Down
5 changes: 2 additions & 3 deletions tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,14 @@ stages:
BuildConfig: 'RelWithDebInfo'
EnvSetupScript: setup_env_cuda.bat
buildArch: x64
additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
msbuildPlatform: x64
isX86: false
job_name_suffix: x64_RelWithDebInfo
RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
ORT_EP_NAME: CUDA
WITH_CACHE: true
# Some unit tests crash on A10 GPUs. So this job still needs to use T4.
MachinePool: onnxruntime-Win2022-GPU-T4
MachinePool: onnxruntime-Win2022-GPU-A10
isTraining: true

- stage: dml
Expand Down

0 comments on commit 2b2dedb

Please sign in to comment.