From 4aa84003cad57610d05603367261bce9d27cafe4 Mon Sep 17 00:00:00 2001 From: guyang3532 <62738430+guyang3532@users.noreply.github.com> Date: Wed, 27 Mar 2024 16:10:07 +0800 Subject: [PATCH 1/9] support Pow/Div/Sqrt in PaddingElimination (#20083) --- .../compute_optimizer/padding_elimination.cc | 11 ++++-- .../python/orttraining_test_ortmodule_api.py | 35 +++++++++++++------ 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc index d42af92c7c66d..1f65d886a4b8b 100644 --- a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc +++ b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc @@ -224,8 +224,10 @@ void IterateSubgraphFromNode(Graph& graph, visited.insert(cur); if (graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Add", {7, 13, 14}) || graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "BiasGelu", {1}, kMSDomain) || - graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Sub", {7, 13, 14}) || - graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Mul", {7, 13, 14})) { + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Div", {7, 13, 14}) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Mul", {7, 13, 14}) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Pow", {7, 12, 13, 15}) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Sub", {7, 13, 14})) { ORT_ENFORCE(subgraph.find(cur->MutableInputDefs()[0]) != subgraph.end() || subgraph.find(cur->MutableInputDefs()[1]) != subgraph.end()); if (cur->InputDefs()[0]->Shape() && cur->InputDefs()[1]->Shape()) { @@ -278,7 +280,10 @@ void IterateSubgraphFromNode(Graph& graph, subgraph.insert(cur->MutableOutputDefs()[1]); PushAllOutputNode(graph, to_visit, cur, visited); } else if (graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Cast", {9, 13}) || - graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Gelu", {1}, kMSDomain)) { + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "FastGelu", {1}, kMSDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Gelu", {1}, kMSDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "QuickGelu", {1}, kMSDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Sqrt", {6, 13})) { ORT_ENFORCE(subgraph.find(cur->MutableInputDefs()[0]) != subgraph.end()); subgraph.insert(cur->MutableOutputDefs()[0]); PushAllOutputNode(graph, to_visit, cur, visited); diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index da217eb76949c..5078058995281 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -5725,8 +5725,6 @@ def run_step(model, input, target): @pytest.mark.parametrize("label_is_sparse", [False, True]) @pytest.mark.parametrize("rank", [1, 2]) def test_runtime_inspector_label_and_embed_sparsity_detection(embed_is_sparse, label_is_sparse, rank, caplog): - os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] = "1" - class NeuralNetCrossEntropyLoss(torch.nn.Module): def __init__(self, num_embeddings, embedding_dim): super().__init__() @@ -5797,10 +5795,12 @@ def run_step(model, input, positions): "test_cases", [ ("Add", 0), + ("Add", 1), ("Add", 2), ("Add", 3), ("Add", 4), ("Sub", 0), + ("Sub", 1), ("Sub", 2), 
("Sub", 3), ("Sub", 4), @@ -5808,12 +5808,22 @@ def run_step(model, input, positions): ("Mul", 2), ("Mul", 3), ("Mul", 4), + ("Div", 0), + ("Div", 2), + ("Div", 3), + ("Div", 4), + ("Pow", 0), + ("Pow", 1), + ("Pow", 2), + ("Pow", 3), + ("Pow", 4), ("MatMul", 0), ("MatMul", 1), ("Dropout", 0), ("LayerNormalization", 0), ("LayerNormalization", 1), ("Cast", 0), + ("Sqrt", 0), ("BiasGelu", 0), ("Gelu", 0), ("ReduceMean", 0), @@ -5821,7 +5831,6 @@ def run_step(model, input, positions): ], ) def test_ops_for_padding_elimination(test_cases): - os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] = "1" test_op = test_cases[0] case = test_cases[1] @@ -5848,7 +5857,7 @@ def __init__(self, vocab_size, hidden_size, pad_token_id): # pattern should be insert to the arg of [batch_size, 1, hidden_size]. # in case 3, the shapes of inputs of test_op are [batch_size, seqlen, hidden_size] and [1, hidden_size], # the test_op should be included in padding elimination subgraph and a 'Expand + FlattenAndUnpad' - # pattern should be insert to the arg of [batch_size, 1, hidden_size]. + # pattern should be insert to the arg of [1, hidden_size]. # in case 4, the shapes of inputs of test_op are [batch_size, seqlen, hidden_size] and [batch_size, seqlen, hidden_size], # the test_op should be included in padding elimination subgraph and the PadAndUnflatten should be added to # output of test_op. Besides, the other input of Add should be added 'FlattenAndUnpad' to @@ -5858,6 +5867,8 @@ def test_elementwise(self, input_ids): one_input = None if case == 0: one_input = torch.ones(self.hidden_size, dtype=torch.long).to(device) + elif case == 1: + one_input = 1 elif case == 2: one_input = torch.ones((input_shape[0], 1, self.hidden_size), dtype=torch.long).to(device) elif case == 3: @@ -5872,6 +5883,10 @@ def test_elementwise(self, input_ids): output = one_input - inputs_embeds elif test_op == "Mul": output = one_input * inputs_embeds + elif test_op == "Div": + output = inputs_embeds / one_input + elif test_op == "Pow": + output = inputs_embeds ** (one_input * 2) else: output = None return output @@ -5911,6 +5926,8 @@ def test_other(self, input_ids): output = torch.nn.functional.gelu(inputs_embeds + bias) elif test_op == "Gelu": output = torch.nn.functional.gelu(inputs_embeds) + elif test_op == "Sqrt": + output = torch.sqrt(inputs_embeds) elif test_op == "ReduceMean": # In case 0, the inputs_embeds are reduced at last dimension, the ReduceMean should be included in padding # elimination subgraph and the PadAndUnflatten should be added to output of ReduceMean. 
@@ -5924,7 +5941,7 @@ def test_other(self, input_ids): return output def forward(self, input_ids): - if test_op in ["Add", "Mul", "Sub"]: + if test_op in ["Add", "Mul", "Sub", "Div", "Pow"]: output = self.test_elementwise(input_ids) elif test_op == "MatMul": output = self.test_matmul(input_ids) @@ -5953,7 +5970,7 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): model(x) training_model = model._torch_module._execution_manager(True)._onnx_models.optimized_model - if test_op == "Sub": + if test_op == "Sub" or test_op == "Pow": assert len([node.op_type for node in training_model.graph.node if node.op_type == "Sub"]) == 2 else: assert len([node.op_type for node in training_model.graph.node if node.op_type == "Sub"]) == 1 @@ -5974,7 +5991,7 @@ def find_input_node_type(model, arg): return result[0].op_type if len(result) == 1 else None recover_pad_input_optypes = [find_input_node_type(training_model, arg) for arg in recover_pad_node.input] - if test_op == "Add" or test_op == "Mul" or test_op == "Sub": + if test_op == "Add" or test_op == "Mul" or test_op == "Sub" or test_op == "Div" or test_op == "Pow": assert test_op in recover_pad_input_optypes else: if case == 0: @@ -5982,11 +5999,8 @@ def find_input_node_type(model, arg): else: assert "ATen" in recover_pad_input_optypes - del os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] - def test_e2e_padding_elimination(): - os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] = "1" seed = 5033 random.seed(seed) np.random.seed(seed) @@ -6129,7 +6143,6 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): training_model = ort_model._torch_module._execution_manager(True)._onnx_models.optimized_model assert "FlattenAndUnpad" in [node.op_type for node in training_model.graph.node] assert "PadAndUnflatten" in [node.op_type for node in training_model.graph.node] - del os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] @pytest.mark.skipif( From ca465dc087990682adc1db9eeaa0bcc4a1464287 Mon Sep 17 00:00:00 2001 From: Nanashi Date: Wed, 27 Mar 2024 18:07:00 +0900 Subject: [PATCH 2/9] [js] Make error friendly when isOrtFormat is undefined (#19958) ### Description Make the error message friendly when isOrtFormat is undefined (i.e. when `onnxruntime.InferenceSession.create` is called with an ArrayBuffer or Uint8Array). ### Motivation and Context I was trying to run my onnx model with the WebGL EP, but it gave me the error "Cannot read properties of null (reading 'irVersion')". I used the debugger to find that the actual error was `int64 is not supported`, but that error was invisible to me. So I changed the code to show both errors when isOrtFormat is undefined. I haven't written a unit test yet, so I'm marking this as a draft. (I have no idea how to test this, though...)
[d62d942](https://github.com/microsoft/onnxruntime/pull/19958/commits/d62d9425ba7b9e5ff0d0a2ae6998dd53817d5db9) --- js/web/lib/onnxjs/model.ts | 12 +++++++++++- js/web/test/e2e/browser-test-webgl.js | 13 +++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/js/web/lib/onnxjs/model.ts b/js/web/lib/onnxjs/model.ts index f9a1b6e76089d..8e689626011be 100644 --- a/js/web/lib/onnxjs/model.ts +++ b/js/web/lib/onnxjs/model.ts @@ -16,6 +16,7 @@ export class Model { constructor() {} load(buf: Uint8Array, graphInitializer?: Graph.Initializer, isOrtFormat?: boolean): void { + let onnxError: Error|undefined; if (!isOrtFormat) { // isOrtFormat === false || isOrtFormat === undefined try { @@ -25,10 +26,19 @@ export class Model { if (isOrtFormat !== undefined) { throw e; } + onnxError = e; } } - this.loadFromOrtFormat(buf, graphInitializer); + try { + this.loadFromOrtFormat(buf, graphInitializer); + } catch (e) { + if (isOrtFormat !== undefined) { + throw e; + } + // Tried both formats and failed (when isOrtFormat === undefined) + throw new Error(`Failed to load model as ONNX format: ${onnxError}\nas ORT format: ${e}`); + } } private loadFromOnnxFormat(buf: Uint8Array, graphInitializer?: Graph.Initializer): void { diff --git a/js/web/test/e2e/browser-test-webgl.js b/js/web/test/e2e/browser-test-webgl.js index e503f38ae5735..974c81d064c89 100644 --- a/js/web/test/e2e/browser-test-webgl.js +++ b/js/web/test/e2e/browser-test-webgl.js @@ -6,3 +6,16 @@ it('Browser E2E testing - WebGL backend', async function() { await testFunction(ort, {executionProviders: ['webgl']}); }); + +it('Browser E2E testing - invalid buffer', async () => { + try { + await ort.InferenceSession.create( + new Uint8Array(Array.from({length: 100}, () => 42)), {executionProviders: ['webgl']}); + + // Should not reach here. 
+ assert(false); + } catch (e) { + assert(e.message.includes('as ONNX format')); + assert(e.message.includes('as ORT format')); + } +}); From 47903e701a4b20b8f43ba2c77392db8c31a3f664 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 27 Mar 2024 10:35:43 -0700 Subject: [PATCH 3/9] fix condition in web CI YAML (#20095) ### Description fix condition in web CI YAML --- .../github/azure-pipelines/templates/win-web-ci.yml | 12 ++++++------ .../templates/win-web-multi-browsers.yml | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index b7aee559cf73c..fa6103fb8a59d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -155,7 +155,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\01 npm test -- -e=chrome -b=webgl,wasm --user-data-dir=$(Agent.TempDirectory)\web\test\01 @@ -165,7 +165,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\02 npm test -- -e=chrome -b=webgl,wasm,webgpu $(webgpuCommandlineExtraFlags) --user-data-dir=$(Agent.TempDirectory)\web\test\02 @@ -175,7 +175,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\03 npm test -- suite1 -e=chrome -b=webgpu --io-binding=gpu-tensor $(webgpuCommandlineExtraFlags) --user-data-dir=$(Agent.TempDirectory)\web\test\03 @@ -185,7 +185,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\04 npm test -- suite1 -e=chrome -b=webgpu --io-binding=gpu-location $(webgpuCommandlineExtraFlags) --user-data-dir=$(Agent.TempDirectory)\web\test\04 @@ -195,7 +195,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\05 npm test -- --webgl.pack -b=webgl -e=chrome --user-data-dir=$(Agent.TempDirectory)\web\test\05 @@ -204,7 +204,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | 
Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\06 npm test -- --wasm.proxy -b=wasm -e=chrome --user-data-dir=$(Agent.TempDirectory)\web\test\06 diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml index 00109b348e8cb..a0af221607dc8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml @@ -70,7 +70,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test_multi_browsers\01 npm test -- suite0 -e=chrome -b=wasm,webgl --wasm.initTimeout=30000 --file-cache --user-data-dir=$(Agent.TempDirectory)\web\test_multi_browsers\01 @@ -79,7 +79,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'firefox.exe'\" | Format-List CommandLine" displayName: 'Check active Firefox processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test_multi_browsers\02 npm test -- suite0 -b=wasm,webgl -e=firefox --wasm.initTimeout=30000 --file-cache --user-data-dir=$(Agent.TempDirectory)\web\test_multi_browsers\02 @@ -88,7 +88,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'msedge.exe'\" | Format-List CommandLine" displayName: 'Check active Edge processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test_multi_browsers\03 npm test -- suite0 -b=wasm,webgl -e=edge --wasm.initTimeout=30000 --file-cache --user-data-dir=$(Agent.TempDirectory)\web\test_multi_browsers\03 From c8676ffbff5218e226a25651b5b0c981dc5da798 Mon Sep 17 00:00:00 2001 From: Xiaoyu <85524621+xiaoyu-work@users.noreply.github.com> Date: Wed, 27 Mar 2024 10:40:08 -0700 Subject: [PATCH 4/9] Add ModelProto support for quantize api (#20018) ### Description Add ModelProto support for `quantize` api ### Motivation and Context Currently, the `quantize` API only accepts a model path as the input model. However, for large models, saving and loading from disk can be time-consuming. By adding `ModelProto` as an input option to the `quantize` API, significant time can be saved. 
--- .../execution_providers/qnn/preprocess.py | 8 +-- .../execution_providers/qnn/quant_config.py | 8 ++- .../python/tools/quantization/quantize.py | 45 +++++++++++---- .../tools/quantization/shape_inference.py | 55 ++++++++++++++----- 4 files changed, 83 insertions(+), 33 deletions(-) diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py index e584a65574520..85f5d967f9ee3 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py @@ -16,8 +16,8 @@ def qnn_preprocess_model( - model_input: Path, - model_output: Path, + model_input: str | Path | onnx.ModelProto, + model_output: str | Path, fuse_layernorm: bool = False, save_as_external_data: bool = False, all_tensors_to_one_file: bool = False, @@ -37,7 +37,7 @@ def qnn_preprocess_model( - (Optional) Fuse ReduceMean sequence into a single LayerNormalization node. Args: - model_input: Path to the input model file. + model_input: Path to the input model file or ModelProto. model_output: Path the output model file, which is only created if this method returns True. fuse_layernorm: True if ReduceMean sequences should be fused into LayerNormalization nodes. Defaults to False. @@ -82,7 +82,7 @@ def qnn_preprocess_model( to cancel out. """ modified = False - model = onnx.load_model(model_input) + model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input) onnx_model = ONNXModel(model) # Fuse Erf sequence into a single Gelu diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py index 479eaf5b0c542..3a217fdfaaffd 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py @@ -39,7 +39,7 @@ def warn_unable_to_override( def get_qnn_qdq_config( - model_input: Path, + model_input: str | Path | onnx.ModelProto, calibration_data_reader: CalibrationDataReader, calibrate_method=CalibrationMethod.MinMax, activation_type=QuantType.QUInt8, @@ -56,7 +56,11 @@ def get_qnn_qdq_config( if weight_symmetric is None: weight_symmetric = weight_type in {QuantType.QInt8, QuantType.QInt16} - model = onnx.load_model(model_input, load_external_data=False) + model = ( + model_input + if isinstance(model_input, onnx.ModelProto) + else onnx.load_model(model_input, load_external_data=False) + ) op_types = set() model_has_external_data = False diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 9b0c15e4b4dde..9ebd7bf3c408a 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -6,6 +6,9 @@ import logging import tempfile from pathlib import Path +from typing import Union + +import onnx from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator from .onnx_quantizer import ONNXQuantizer @@ -16,6 +19,7 @@ QuantType, load_model_with_shape_infer, model_has_pre_process_metadata, + save_and_reload_model_with_shape_infer, ) from .registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry @@ -280,8 +284,8 @@ def check_static_quant_arguments(quant_format: QuantFormat, activation_type: Qua def quantize_static( - model_input, - 
model_output, + model_input: Union[str, Path, onnx.ModelProto], + model_output: Union[str, Path], calibration_data_reader: CalibrationDataReader, quant_format=QuantFormat.QDQ, op_types_to_quantize=None, @@ -304,7 +308,7 @@ def quantize_static( Args: - model_input: file path of model to quantize + model_input: file path of model or ModelProto to quantize model_output: file path of quantized model calibration_data_reader: a calibration data reader. It enumerates calibration data and generates inputs for the @@ -435,7 +439,11 @@ def quantize_static( qdq_ops = list(QDQRegistry.keys()) op_types_to_quantize = list(set(q_linear_ops + qdq_ops)) - model = load_model_with_shape_infer(Path(model_input)) + model = ( + save_and_reload_model_with_shape_infer(model_input) + if isinstance(model_input, onnx.ModelProto) + else load_model_with_shape_infer(Path(model_input)) + ) pre_processed: bool = model_has_pre_process_metadata(model) if not pre_processed: @@ -485,6 +493,15 @@ def inc_dataloader(): model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir: + if isinstance(model_input, onnx.ModelProto): + output_path = str(Path(quant_tmp_dir) / "model_input.onnx") + onnx.save_model( + model_input, + output_path, + save_as_external_data=True, + ) + model_input = output_path + calibrator = create_calibrator( Path(model_input), op_types_to_quantize, @@ -546,8 +563,8 @@ def inc_dataloader(): def quantize_dynamic( - model_input: Path, - model_output: Path, + model_input: Union[str, Path, onnx.ModelProto], + model_output: Union[str, Path], op_types_to_quantize=None, per_channel=False, reduce_range=False, @@ -560,7 +577,7 @@ def quantize_dynamic( """Given an onnx model, create a quantized onnx model and save it into a file Args: - model_input: file path of model to quantize + model_input: file path of model or ModelProto to quantize model_output: file path of quantized model op_types_to_quantize: specify the types of operators to quantize, like ['Conv'] to quantize Conv only. @@ -609,7 +626,11 @@ def quantize_dynamic( if not op_types_to_quantize or len(op_types_to_quantize) == 0: op_types_to_quantize = list(IntegerOpsRegistry.keys()) - model = load_model_with_shape_infer(Path(model_input)) + model = ( + save_and_reload_model_with_shape_infer(model_input) + if isinstance(model_input, onnx.ModelProto) + else load_model_with_shape_infer(Path(model_input)) + ) pre_processed: bool = model_has_pre_process_metadata(model) if not pre_processed: @@ -642,15 +663,15 @@ def quantize_dynamic( def quantize( - model_input: Path, - model_output: Path, + model_input: Union[str, Path, onnx.ModelProto], + model_output: Union[str, Path], quant_config: QuantConfig, ): """Quantize a model with QuantConfig. Args: - model_input (Path): Path to the model to quantize. - model_output (Path): Path to save the quantized model. + model_input (str | Path | ModelProto): Path to the model or ModelProto to quantize. + model_output (str | Path): Path to save the quantized model. quant_config (QuantConfig): Quantization Configuration. 
""" diff --git a/onnxruntime/python/tools/quantization/shape_inference.py b/onnxruntime/python/tools/quantization/shape_inference.py index b7d4726610387..7368304837a96 100644 --- a/onnxruntime/python/tools/quantization/shape_inference.py +++ b/onnxruntime/python/tools/quantization/shape_inference.py @@ -9,12 +9,13 @@ import tempfile import traceback from pathlib import Path -from typing import Optional +from typing import Optional, Union import onnx import onnxruntime from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference +from onnxruntime.transformers.onnx_utils import extract_raw_data_from_model, has_external_data from .quant_utils import add_pre_process_metadata @@ -22,8 +23,8 @@ def quant_pre_process( - input_model_path: str, - output_model_path: str, + input_model: Union[str, Path, onnx.ModelProto], + output_model_path: Union[str, Path], skip_optimization: bool = False, skip_onnx_shape: bool = False, skip_symbolic_shape: bool = False, @@ -39,7 +40,7 @@ def quant_pre_process( """Shape inference and model optimization, in preparation for quantization. Args: - input_model_path: Path to the input model file") + input_model: Path to the input model file or ModelProto output_model_path: Path to the output model file skip_optimization: Skip model optimization step if true. This may result in ONNX shape inference failure for some models. @@ -68,8 +69,9 @@ def quant_pre_process( if not skip_symbolic_shape: logger.info("Performing symbolic shape inference...") + loaded_model = input_model if isinstance(input_model, onnx.ModelProto) else onnx.load(input_model) model = SymbolicShapeInference.infer_shapes( - onnx.load(input_model_path), + loaded_model, int_max, auto_merge, guess_output_rank, @@ -80,18 +82,18 @@ def quant_pre_process( # Use ORT optimizers (native code) to optimize model if not skip_symbolic_shape: # Need to save the inferenced model to file so as to run the optimizer - input_model_path = str(temp_path / "symbolic_shape_inferred.onnx") + input_model = str(temp_path / "symbolic_shape_inferred.onnx") if save_as_external_data: onnx.save_model( model, - input_model_path, + input_model, save_as_external_data=True, all_tensors_to_one_file=all_tensors_to_one_file, size_threshold=external_data_size_threshold, convert_attribute=False, ) else: - onnx.save(model, input_model_path) + onnx.save(model, input_model) model = None opt_model_path = str(temp_path / "optimized.onnx") @@ -99,7 +101,19 @@ def quant_pre_process( sess_option = onnxruntime.SessionOptions() sess_option.optimized_model_filepath = opt_model_path sess_option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC - sess = onnxruntime.InferenceSession(input_model_path, sess_option, providers=["CPUExecutionProvider"]) + # For large model, extract external data from model and add to session options + if isinstance(input_model, onnx.ModelProto): + if has_external_data(input_model): + raise ValueError( + "ModelProto has external data not loaded into memory, ORT cannot create session. " + "Please load external data before calling this function. " + "See https://onnx.ai/onnx/repo-docs/ExternalData.html for more information." 
+ ) + external_names, external_values = extract_raw_data_from_model(input_model) + sess_option.add_external_initializers(list(external_names), list(external_values)) + input_model = input_model.SerializeToString() + + sess = onnxruntime.InferenceSession(input_model, sess_option, providers=["CPUExecutionProvider"]) # Close the session to avoid the cleanup error on Windows for temp folders # https://github.com/microsoft/onnxruntime/issues/17627 del sess @@ -109,7 +123,7 @@ def quant_pre_process( ) logger.error(traceback.format_exc()) - input_model_path = opt_model_path + input_model = opt_model_path if not skip_onnx_shape: # ONNX shape inference. @@ -117,26 +131,37 @@ def quant_pre_process( # If the skip optimization is specified, we could be dealing with a # large model. So be on the safe side, save the model if model is not None: - input_model_path = str(temp_path / "symbolic_shape_inferred.onnx") + input_model = str(temp_path / "symbolic_shape_inferred.onnx") if save_as_external_data: onnx.save_model( model, - input_model_path, + input_model, save_as_external_data=True, all_tensors_to_one_file=all_tensors_to_one_file, size_threshold=external_data_size_threshold, convert_attribute=False, ) else: - onnx.save(model, input_model_path) + onnx.save(model, input_model) model = None + if isinstance(input_model, onnx.ModelProto): + input_model = str(Path(quant_tmp_dir) / "model_input.onnx") + onnx.save_model( + model, + input_model, + save_as_external_data=True, + all_tensors_to_one_file=all_tensors_to_one_file, + size_threshold=external_data_size_threshold, + convert_attribute=False, + ) + inferred_model_path = str(temp_path / "onnx_shape_inferred.onnx") - onnx.shape_inference.infer_shapes_path(input_model_path, inferred_model_path) + onnx.shape_inference.infer_shapes_path(input_model, inferred_model_path) model = onnx.load(inferred_model_path) if model is None: - model = onnx.load(input_model_path) + model = input_model if isinstance(input_model, onnx.ModelProto) else onnx.load(input_model) add_pre_process_metadata(model) From 4df9d16f98cd5ac60e1c6151207dfbf9f5a165a4 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 28 Mar 2024 03:20:57 +0800 Subject: [PATCH 5/9] [Fix] TSAUpload task must be in building stage (#20098) ### Description In #20085, TSAUpload was in testing stage so main branch failed. 
--- .../github/azure-pipelines/templates/py-win-gpu.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml index e200fb9e93bee..59387a0de4cd1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml @@ -204,6 +204,17 @@ stages: inputs: AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll;-:file|$(Build.ArtifactStagingDirectory)\**\DirectML.dll' + - task: TSAUpload@2 + displayName: 'TSA upload' + condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) + inputs: + GdnPublishTsaOnboard: false + GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' + + - template: component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + - stage: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests dependsOn: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build jobs: From ab2eaedfaa1e1abe5c128cfbccf8c6a45b75480e Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 28 Mar 2024 03:29:34 +0800 Subject: [PATCH 6/9] Install ONNX by building source code in Windows DML stage (#20079) ### Description In #20073, I pinned the onnx version to unblock the whole PR CI. In fact, we can use the onnx that is installed by building it from source, so that the onnx version is controlled by deps.txt. For historical reasons, the DML stage installed onnx from PyPI. Now onnx can be installed the same way as in the other stages. Also add an option to skip installing onnx in win-ci-prebuild-steps.yml. --- tools/ci_build/build.py | 7 ------- .../templates/jobs/win-ci-prebuild-steps.yml | 5 +++++ tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml | 5 ++--- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index e1649ae251d88..7dfdbc301622a 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2083,13 +2083,6 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): # For CUDA or DML enabled builds test IOBinding feature if args.use_cuda or args.use_dml: log.info("Testing IOBinding feature") - if args.use_dml: - run_subprocess( - [sys.executable, "-m", "pip", "uninstall", "--yes", "onnx"], cwd=cwd, dll_path=dll_path - ) - run_subprocess( - [sys.executable, "-m", "pip", "install", "-q", "onnx==1.15.0"], cwd=cwd, dll_path=dll_path - ) run_subprocess([sys.executable, "onnxruntime_test_python_iobinding.py"], cwd=cwd, dll_path=dll_path) if args.use_cuda: diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml index 864513bc4d671..67a2543bfb50e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml @@ -13,6 +13,10 @@ parameters: type: boolean default: false +- name: InstallONNX + type: boolean + default: true + - name: WITHCACHE type: boolean default: false @@ -106,6 +110,7 @@ steps: displayName: Install ccache and update PATH to use linked versions of gcc, cc, etc +- ${{ if eq(parameters.InstallONNX, true) }}: - ${{ if eq(parameters.WITHCACHE, true) }}: - task: Cache@2 # machinepool is used to ensure the compiler is same diff --git
a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml index 53eea1d69fb0e..c333c7ef084d0 100644 --- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml @@ -68,6 +68,7 @@ stages: BuildConfig: Debug MachinePool: 'onnxruntime-Win-CPU-2022' WithCache: false + InstallONNX: false Today: $(TODAY) - task: PythonScript@0 @@ -155,7 +156,7 @@ stages: GenerateDocumentation: false WITH_CACHE: false MachinePool: 'onnxruntime-Win-CPU-2022' - + - stage: x86_release dependsOn: [] jobs: @@ -256,5 +257,3 @@ stages: GenerateDocumentation: false WITH_CACHE: false MachinePool: 'onnxruntime-Win-CPU-2022' - - From b95fd4e644775a4343c13435bd729bd64f411752 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 27 Mar 2024 13:32:36 -0700 Subject: [PATCH 7/9] Enable CUDA EP unit testing on Windows (#20039) ### Description Address build issues and source code discrepancies. Fix cuda_test_provider gtest argument stack corruption. ### Motivation and Context The `OpTester` class that is widely used for kernel testing is not suitable for testing the internal classes of EPs that are built as shared objects. Currently, CUDA EP tests run only on Linux. We want to enable testing and development on Windows, and create a usable pattern for testing other EPs' internals. Alternatives considered: abstracting EP unit tests into a separate test executable such as `onnxruntime_test_all`. This alternative was rejected as it would create a lot more changes in the established patterns, and could potentially interfere with CUDA functionality while making source code maintenance more complex. --- cmake/CMakeLists.txt | 2 +- cmake/onnxruntime_providers_cuda.cmake | 2 +- cmake/onnxruntime_unittests.cmake | 7 + .../core/framework/execution_provider.h | 2 +- .../onnxruntime/core/framework/run_options.h | 2 +- .../core/mickey/blk_q4/f16_prepack_sm80.h | 16 +- .../threadblock/quantb_mma_multistage.h | 2 +- .../quantb_meta_mma_tensor_op_tile_iterator.h | 25 ++- .../shared_library/provider_wrappedtypes.h | 15 +- onnxruntime/core/util/matrix_layout.h | 2 +- .../test/cuda_host/blkq4_fp16_quant_sm80.h | 7 +- .../cuda/test_cases/beam_search_topk.cc | 3 +- .../cuda/test_cases/blkq4_fp16_gemm_sm80.h | 22 ++- .../test_cases/blkq4_fp16_gemm_sm80_test.cc | 16 +- .../test_cases/blkq4_fp16_gemm_sm80_testcu.cu | 187 +++++++++--------- .../cuda_execution_provider_test.cc | 11 +- .../cuda/test_cases/cuda_test_provider.cc | 32 ++- .../cuda/test_cases/cuda_utils_test.cc | 7 +- .../cuda/test_cases/gemm_options_test.cc | 4 +- .../test_cases/reduction_functions_test.cc | 4 +- .../azure-pipelines/linux-gpu-ci-pipeline.yml | 4 +- .../azure-pipelines/win-gpu-ci-pipeline.yml | 18 +- 22 files changed, 216 insertions(+), 174 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index ee1959bb357fe..3293506141689 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -76,7 +76,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) # Enable ONNX Runtime CUDA EP's internal unit tests that directly access the EP's internal functions instead of through # OpKernels. When the option is ON, we will have two copies of GTest library in the same process. It is not a typical # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
-cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF) +cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF) option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index aeeac10ead27d..1346a9ce968c6 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -122,7 +122,7 @@ endif() if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) # cuda_provider_interface.cc is removed from the object target: onnxruntime_providers_cuda_obj and - # add to the lib onnxruntime_providers_cuda separatedly. + # added to the lib onnxruntime_providers_cuda separately. # onnxruntime_providers_cuda_ut can share all the object files with onnxruntime_providers_cuda except cuda_provider_interface.cc. set(cuda_provider_interface_src ${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_provider_interface.cc) list(REMOVE_ITEM onnxruntime_providers_cuda_src ${cuda_provider_interface_src}) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 1ffb838328643..4a351dcf90d45 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -779,6 +779,13 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock) target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey) target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common) + if (MSVC) + # Cutlass code has an issue with the following: + # warning C4100: 'magic': unreferenced formal parameter + target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$:SHELL:--compiler-options /wd4100>" + "$<$>:/wd4100>") + endif() + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut) endif() diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 40ca96a19aef1..16ad943a5f47e 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -53,7 +53,7 @@ struct NodeComputeInfo { DestroyFunctionStateFunc release_state_func; }; -using RunOptions = OrtRunOptions; +using RunOptions = ::OrtRunOptions; enum class DataLayout { NCHW, diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h index 5444c825d7991..789c3b13f2c3e 100644 --- a/include/onnxruntime/core/framework/run_options.h +++ b/include/onnxruntime/core/framework/run_options.h @@ -45,5 +45,5 @@ struct OrtRunOptions { }; namespace onnxruntime { -using RunOptions = OrtRunOptions; +using RunOptions = ::OrtRunOptions; } // namespace onnxruntime diff --git a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h index a08cfb97eed4a..c81b4967d2719 100644 --- a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h +++ b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h @@ -110,8 +110,8 @@ struct BlockwiseQuantization { static void 
prepack_weights( int rows, int columns, - const gsl::span& weights, // <- int4 weights, column major - const gsl::span& weights_prepacked // <- int4 prepacked weights tensor, same size buffer + gsl::span weights, // <- int4 weights, column major + gsl::span weights_prepacked // <- int4 prepacked weights tensor, same size buffer ) { ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0 && (rows % QuantBlocking::kRow) == 0 && @@ -171,10 +171,10 @@ struct BlockwiseQuantization { static void prepack_quant_scales( size_t rows, size_t columns, - const gsl::span& scales, // <- quant scales, column major layout - const gsl::span& scales_prepacked // <- quant scales prepacked, same size buffer + gsl::span scales, // <- quant scales, column major layout + gsl::span scales_prepacked // <- quant scales prepacked, same size buffer ) { - auto meta_shape = get_quant_meta_shape(rows, columns); + auto meta_shape = get_quant_meta_shape(static_cast(rows), static_cast(columns)); ORT_ENFORCE(scales.size() == size_t(meta_shape.product()), "Quantization scale tensor shape mismatch!"); ORT_ENFORCE(scales_prepacked.size() == size_t(meta_shape.product()), @@ -241,10 +241,10 @@ struct BlockwiseQuantization { static void prepack_quant_offsets( size_t rows, size_t columns, - const gsl::span& offsets, // <- quant offsets, int4, column major layout - const gsl::span& offsets_prepacked // <- quant offsets prepacked, double size buffer + gsl::span offsets, // <- quant offsets, int4, column major layout + gsl::span offsets_prepacked // <- quant offsets prepacked, double size buffer ) { - auto meta_shape = get_quant_meta_shape(rows, columns); + auto meta_shape = get_quant_meta_shape(static_cast(rows), static_cast(columns)); ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0, "Does not support odd number of rows or columns!"); diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h index 8b6bac8c5099a..28364cc34f2d7 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h @@ -132,7 +132,7 @@ struct DummyType{ } CUTLASS_HOST_DEVICE - std::monostate& operator[](int idx) { + std::monostate& operator[](int /*idx */) { return dummy_; } }; diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h index 4ba39dda3db8d..26239161cf8a3 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h @@ -437,7 +437,7 @@ class QuantBMetaMmaTensorOpTileIterator const &weights, Array& dest){ static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm."); @@ -453,19 +453,18 @@ class QuantBMetaMmaTensorOpTileIterator(dest.data()); const b64* scales_ptr = reinterpret_cast(scales.data()); - const ElementOffset* offsets_ptr = nullptr; - if constexpr(kHasOffset) { offsets_ptr = offsets.data(); } + [[maybe_unused]] const ElementOffset* fragment_offsets_ptr = nullptr; + if constexpr(kHasOffset) { fragment_offsets_ptr = fragment_offsets.data(); } CUTLASS_PRAGMA_UNROLL for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){ // dequantize: d = scale * (weight - offset) // to use FMA, d = scale * weight + (scale * (-offset)) - b64 offsets; - if 
constexpr(kHasOffset){ - const uint32_t* p = reinterpret_cast(offsets_ptr); - + [[maybe_unused]] b64 offsets{0}; + if constexpr(kHasOffset) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + const uint32_t* p = reinterpret_cast(fragment_offsets_ptr); asm volatile( "{\n\t" " .reg .b32 rb0, rb1;\n" // b32 regs for fp16x2 mul operands @@ -486,7 +485,7 @@ class QuantBMetaMmaTensorOpTileIterator= 800)) asm volatile( @@ -541,7 +540,7 @@ class QuantBMetaMmaTensorOpTileIterator(-16 - int(offsets[idx])); + offset = s * static_cast(-16 - static_cast(fragment_offsets[idx])); } else { offset = s * static_cast(-16-8); } @@ -795,13 +794,13 @@ class QuantBMetaMmaTensorOpTileIterator(scales.data()); - uint32_t* addon_ptr = reinterpret_cast(addon); - if constexpr (kHasOffset){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + const uint32_t* scales_ptr = reinterpret_cast(scales.data()); + uint32_t* addon_ptr = reinterpret_cast(addon); // possible buffer over read 2 bytes here. const uint32_t* p = reinterpret_cast(offsets.data()); -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( "{\n\t" " .reg .b32 rb0, rb1, rb2;\n" diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index bdad18c7edec0..3bb938c1a3197 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -394,14 +394,6 @@ struct ConfigOptions final { PROVIDER_DISALLOW_ALL(ConfigOptions) }; -struct OrtRunOptions final { - const ConfigOptions& GetConfigOptions() const { - return g_host->RunOptions__GetConfigOptions(this); - } - - PROVIDER_DISALLOW_ALL(OrtRunOptions) -}; - struct ComputeCapability final { static std::unique_ptr Create(std::unique_ptr t_sub_graph) { return g_host->ComputeCapability__construct(std::move(t_sub_graph)); } static void operator delete(void* p) { g_host->ComputeCapability__operator_delete(reinterpret_cast(p)); } @@ -1283,3 +1275,10 @@ template <> inline gsl::span Tensor::DataAsSpan() const { return g_host->Tensor__DataAsSpan_int64(this); } } // namespace onnxruntime + +struct OrtRunOptions final { + const onnxruntime::ConfigOptions& GetConfigOptions() const { + return onnxruntime::g_host->RunOptions__GetConfigOptions(this); + } + PROVIDER_DISALLOW_ALL(OrtRunOptions) +}; diff --git a/onnxruntime/core/util/matrix_layout.h b/onnxruntime/core/util/matrix_layout.h index 783a29d8a2055..43843da3fb96e 100644 --- a/onnxruntime/core/util/matrix_layout.h +++ b/onnxruntime/core/util/matrix_layout.h @@ -378,7 +378,7 @@ class MatrixRef { MatrixRef( NonConstMatrixRef const& ref, ///< MatrixRef to non-const data /// SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const - _Magic magic = (typename std::enable_if::type)0 + [[maybe_unused]] _Magic magic = (typename std::enable_if::type)0 ) : data_(ref.data()), shape_(ref.shape()), layout_(Layout::packed(ref.shape())) {} ORT_FORCEINLINE diff --git a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h index 6ea8b55505214..942b1c4d2c2ad 100644 --- a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h +++ b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h @@ -74,7 +74,8 @@ inline void sm80_prepack_quant_scales_ref( int columns, const MatrixRef& tensor_scale, const MatrixRef& tensor_scale_prepacked) { - ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] 
== (columns / QuantBlocking::kColumn), + ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] == + (columns / QuantBlocking::kColumn), "Unexpected tensor_scale shape! Expected: (", rows / QuantBlocking::kRow, ", ", columns / QuantBlocking::kColumn, ")"); ORT_ENFORCE(tensor_scale_prepacked.shape() == tensor_scale.shape()); @@ -84,7 +85,9 @@ inline void sm80_prepack_quant_scales_ref( // 2 B operand tiles per mma instruction stacked on k dimension // (1,n) quantization blocking if constexpr (sizeof(ScaleElementT) != 2 || QuantBlocking::kRow != 1) { - ORT_THROW("sm80_prepack_quant_scales_ref should only be called for row-wise block quantization on 16b float values."); + ORT_THROW( + "sm80_prepack_quant_scales_ref should only be called for " + " row-wise block quantization on 16b float values."); } // In Ampere tensor op, each operand B tile is 8 x 8, in a warp of 32 threads, each thread diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc index 9fecec9f7e8bb..a0d115c41c14b 100644 --- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc +++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc @@ -80,7 +80,8 @@ TEST(TestBeamSearch, TopK) { std::vector top_k_values_ref(batch_size * k); std::vector top_k_tokens_ref(batch_size * k); std::vector top_k_indices_ref(batch_size * k); - ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, beam_size, vocab_size, k); + ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, + beam_size, vocab_size, k); const int32_t max_vocab_parts = 128; size_t buffer_size = batch_x_beam_x_vocab * 4 // input diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h index bbe370675fc48..f0dfaf1a58612 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h @@ -14,12 +14,14 @@ #pragma once +#include "test/cuda_host/blkq4_fp16_quant_sm80.h" + #include +#include -#include "core/util/matrix_layout.h" #include "core/common/common.h" #include "core/mickey/blk_q4/f16_prepack_sm80.h" -#include "test/cuda_host/blkq4_fp16_quant_sm80.h" +#include "core/util/matrix_layout.h" namespace onnxruntime { namespace cuda { @@ -48,10 +50,10 @@ Status sm80_supported(); template inline void blkq4_weights_gen( int rows, int columns, - std::vector& dequants, - std::vector& q_weights, - std::vector& q_scales, - std::vector& q_zp) { + thrust::host_vector& dequants, + thrust::host_vector& q_weights, + thrust::host_vector& q_scales, + thrust::host_vector& q_zp) { using Base = onnxruntime::cuda::BlockwiseQuantization< ElementT, block_size, @@ -74,7 +76,7 @@ inline void blkq4_weights_gen( const auto q_weight_shape = Base::get_quant_weights_shape(rows, columns); const auto meta_shape = Base::get_quant_meta_shape(rows, columns); - const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); + [[maybe_unused]] const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); // // For testing quantization and dequantization, it is not straight @@ -120,9 +122,9 @@ inline void blkq4_weights_gen( q_scales.resize(meta_shape.product()); for (size_t i = 0; i < q_scales.size(); i++) { - uint32_t v = dis(gen); - uint32_t m = (v % 63) + 1; - uint32_t e = (v >> 6) % 4; + 
uint32_t vl = dis(gen); + uint32_t m = (vl % 63) + 1; + uint32_t e = (vl >> 6) % 4; q_scales[i] = ElementT(m / static_cast(1 << (2 + e))); } MatrixRef tensor_scale( diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc index e687ae73e66f2..e7fa0dae02fda 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -11,15 +11,15 @@ * well with CUTLASS headers. */ +#include "blkq4_fp16_gemm_sm80.h" + +#include "gtest/gtest.h" +#include #include #include "core/framework/float16.h" #include "core/mlas/inc/mlas_q4.h" -#include "blkq4_fp16_gemm_sm80.h" - -#include "gtest/gtest.h" - namespace onnxruntime { namespace test { @@ -43,10 +43,10 @@ void testPrepack(int rows, int columns) { const auto meta_shape = Base::get_quant_meta_shape(rows, columns); const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); - std::vector q_weights; - std::vector q_scales; - std::vector q_zp; - std::vector dequants; + thrust::host_vector q_weights; + thrust::host_vector q_scales; + thrust::host_vector q_zp; + thrust::host_vector dequants; onnxruntime::cuda::test::blkq4_weights_gen( rows, columns, dequants, q_weights, q_scales, q_zp); diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu index 69c929d446ce4..210c33933d90d 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu @@ -11,9 +11,11 @@ * well with gtest headers. */ +#include "blkq4_fp16_gemm_sm80.h" + #include -#include #include +#include #include "core/mickey/blk_q4/f16_gemm_sm80.h" @@ -26,13 +28,11 @@ #include "core/common/common.h" -#include "blkq4_fp16_gemm_sm80.h" - namespace onnxruntime { -namespace cuda{ -namespace test{ +namespace cuda { +namespace test { -Status sm80_supported(){ +Status sm80_supported() { cudaDeviceProp props; cudaError_t error = cudaGetDeviceProperties(&props, 0); @@ -55,27 +55,25 @@ Status sm80_supported(){ * Copied directly from cutlass util/reference/device/gemm.h * for the strange reason that compiler insists on asking * for explicit stream argument in kernel launch. -*/ + */ template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename AccumulatorType -> + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename AccumulatorType> void compute_gemm_ref( - cutlass::gemm::GemmCoord problem_size, - ScalarType alpha, - cutlass::TensorRef tensor_a, - cutlass::TensorRef tensor_b, - ScalarType beta, - cutlass::TensorRef tensor_c, - cutlass::TensorRef tensor_d, - AccumulatorType initial_accum = AccumulatorType(0)) { - + cutlass::gemm::GemmCoord problem_size, + ScalarType alpha, + cutlass::TensorRef tensor_a, + cutlass::TensorRef tensor_b, + ScalarType beta, + cutlass::TensorRef tensor_c, + cutlass::TensorRef tensor_d, + AccumulatorType initial_accum = AccumulatorType(0)) { // Blocking structure potentially improves performance of reference implementation // with a minor increase in complexity. 
// @@ -85,30 +83,27 @@ void compute_gemm_ref( dim3 block(16, 8); dim3 grid( - (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), - (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn) - ); + (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), + (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn)); // Launch a GEMM kernel cutlass::reference::device::kernel::Gemm< - cutlass::TensorRef, - cutlass::TensorRef, - cutlass::TensorRef, - ScalarType, - AccumulatorType, - OutputTile, - cutlass::multiply_add, - cutlass::NumericConverter - ><<>>( - problem_size, - alpha, - tensor_a, - tensor_b, - beta, - tensor_c, - tensor_d, - initial_accum - ); + cutlass::TensorRef, + cutlass::TensorRef, + cutlass::TensorRef, + ScalarType, + AccumulatorType, + OutputTile, + cutlass::multiply_add, + cutlass::NumericConverter><<>>( + problem_size, + alpha, + tensor_a, + tensor_b, + beta, + tensor_c, + tensor_d, + initial_accum); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -117,28 +112,31 @@ void compute_gemm_ref( // template < - typename Element, - typename LayoutCutlass, - typename Layout = std::conditional_t::value, ColumnMajorLayout, RowMajorLayout> - > + typename Element, + typename LayoutCutlass, + typename Layout = std::conditional_t::value, + ColumnMajorLayout, RowMajorLayout>> __forceinline__ -MatrixRef make_MatrixRef(cutlass::HostTensor const& tensor) { - static_assert(std::is_same::value - || std::is_same::value); + MatrixRef + make_MatrixRef(cutlass::HostTensor const& tensor) { + static_assert(std::is_same::value || + std::is_same::value); auto shape = make_Position(tensor.extent().row(), tensor.extent().column()); - auto* ptr = const_cast::type *>(tensor.host_data()); + auto* ptr = const_cast::type*>(tensor.host_data()); return MatrixRef(ptr, tensor.capacity(), shape); } template < - typename Element, - typename LayoutCutlass, - typename Layout = std::conditional_t::value, ColumnMajorLayout, RowMajorLayout> - > + typename Element, + typename LayoutCutlass, + typename Layout = std::conditional_t::value, + ColumnMajorLayout, RowMajorLayout>> __forceinline__ -MatrixRef make_ConstMatrixRef(cutlass::HostTensor const& tensor) { - static_assert(std::is_same::value - || std::is_same::value); + MatrixRef + make_ConstMatrixRef(cutlass::HostTensor const& tensor) { + static_assert(std::is_same::value || + std::is_same::value); auto shape = make_Position(tensor.extent().row(), tensor.extent().column()); return MatrixRef(tensor.host_data(), tensor.capacity(), shape); } @@ -147,7 +145,7 @@ MatrixRef make_ConstMatrixRef(cutlass::HostTensor, - cutlass::MatrixShape<1, block_size>>::type; + typename std::conditional, + cutlass::MatrixShape<1, block_size>>::type; using GemmRunner = BlkQ4F16GemmImpl; @@ -181,17 +179,18 @@ void run_blkq4_gemm(int m, int n, int k) { using LayoutInputQScale = typename GemmRunner::LayoutInputQScale; const cutlass::gemm::GemmCoord problem_size = {m, n, k}; - const auto q_weight_shape = cutlass::make_Coord(problem_size.k()/2, problem_size.n()); - const auto meta_shape = cutlass::make_Coord(problem_size.k()/QuantBlocking::kRow, problem_size.n()/QuantBlocking::kColumn); + const auto q_weight_shape = cutlass::make_Coord(problem_size.k() / 2, problem_size.n()); + const auto meta_shape = cutlass::make_Coord(problem_size.k() / QuantBlocking::kRow, problem_size.n() / + QuantBlocking::kColumn); // // Generate 
quantized and dequantizeed input matrix B [K, N] // static_assert(std::is_same::value); - std::vector q_weights; - std::vector q_scales; - std::vector q_zp; - std::vector dequants; + thrust::host_vector q_weights; + thrust::host_vector q_scales; + thrust::host_vector q_zp; + thrust::host_vector dequants; onnxruntime::cuda::test::blkq4_weights_gen( problem_size.k(), problem_size.n(), dequants, q_weights, q_scales, q_zp); @@ -201,11 +200,11 @@ void run_blkq4_gemm(int m, int n, int k) { 4, column_wise_blocking>; - std::vector packed_w(q_weight_shape.product()); + thrust::host_vector packed_w(q_weight_shape.product()); PrepackT::prepack_weights(problem_size.k(), problem_size.n(), q_weights, packed_w); - std::vector packed_scales(meta_shape.product()); + thrust::host_vector packed_scales(meta_shape.product()); PrepackT::prepack_quant_scales(problem_size.k(), problem_size.n(), q_scales, packed_scales); - std::vector packed_zp; + thrust::host_vector packed_zp; if constexpr (has_offsets) { packed_zp.resize(meta_shape.product()); PrepackT::prepack_quant_offsets(problem_size.k(), problem_size.n(), q_zp, packed_zp); @@ -240,16 +239,16 @@ void run_blkq4_gemm(int m, int n, int k) { // thrust::device_vector d_packed_w(packed_w); cutlass::TensorRef ref_W( - reinterpret_cast(d_packed_w.data().get()), - LayoutInputWPack::packed({problem_size.k()/2, problem_size.n()/2})); + reinterpret_cast(d_packed_w.data().get()), + LayoutInputWPack::packed({problem_size.k() / 2, problem_size.n() / 2})); thrust::device_vector d_packed_scales(packed_scales); cutlass::TensorRef ref_scales( - d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape)); + d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape)); thrust::device_vector d_packed_zp(packed_zp); cutlass::TensorRef ref_zp( - d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape)); + d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape)); tensor_a.sync_device(); tensor_c.sync_device(); @@ -257,16 +256,16 @@ void run_blkq4_gemm(int m, int n, int k) { // run GEMM cutlass::Status status; - if constexpr (has_offsets){ + if constexpr (has_offsets) { status = GemmRunner::run( - nullptr, problem_size, tensor_a.device_ref(), ref_W, - ref_scales, ref_zp, - tensor_c.device_ref(), tensor_d.device_ref()); + nullptr, problem_size, tensor_a.device_ref(), ref_W, + ref_scales, ref_zp, + tensor_c.device_ref(), tensor_d.device_ref()); } else { status = GemmRunner::run( - nullptr, problem_size, tensor_a.device_ref(), ref_W, - ref_scales, - tensor_c.device_ref(), tensor_d.device_ref()); + nullptr, problem_size, tensor_a.device_ref(), ref_W, + ref_scales, + tensor_c.device_ref(), tensor_d.device_ref()); } ORT_ENFORCE(status == cutlass::Status::kSuccess, "Kernel execution failed: ", cutlassGetStatusString(status)); @@ -275,7 +274,7 @@ void run_blkq4_gemm(int m, int n, int k) { using LayoutInputB = cutlass::layout::ColumnMajor; thrust::device_vector d_dequants(dequants); cutlass::TensorRef ref_B( - d_dequants.data().get(), LayoutInputB::packed(problem_size.kn())); + d_dequants.data().get(), LayoutInputB::packed(problem_size.kn())); cutlass::HostTensor tensor_ref_d( problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from // reference kernel @@ -289,9 +288,9 @@ void run_blkq4_gemm(int m, int n, int k) { ElementComputeEpilogue beta = ElementComputeEpilogue(0); compute_gemm_ref( + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementComputeEpilogue, ElementAccumulator>( problem_size, alpha, 
tensor_a.device_ref(), @@ -300,17 +299,17 @@ void run_blkq4_gemm(int m, int n, int k) { tensor_c.device_ref(), tensor_ref_d.device_ref()); - // Wait for kernels to finish + //// Wait for kernels to finish cudaDeviceSynchronize(); - // Copy output data from CUTLASS and reference kernel to host for comparison + //// Copy output data from CUTLASS and reference kernel to host for comparison tensor_d.sync_host(); tensor_ref_d.sync_host(); - // Check if output from CUTLASS kernel and reference kernel are equal or not + //// Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::host::TensorEquals( - tensor_d.host_view(), - tensor_ref_d.host_view()); + tensor_d.host_view(), + tensor_ref_d.host_view()); ORT_ENFORCE(passed, "Gemm kernel result wrong!"); } diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc index 8dfaaedcbb378..72357ec7e02d2 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc @@ -5,11 +5,14 @@ // extra code in the core of CUDA EP and that code may // 1. slow down performance critical applications and // 2. increase binary size of ORT. + +#include "gtest/gtest.h" #include -#include "core/providers/cuda/cuda_execution_provider.h" + +#include "core/framework/run_options.h" #include "core/providers/cuda/cuda_allocator.h" +#include "core/providers/cuda/cuda_execution_provider.h" #include "core/providers/cuda/cuda_stream_handle.h" -#include "gtest/gtest.h" namespace onnxruntime { namespace cuda { @@ -22,7 +25,7 @@ TEST(TestDeferredRelease, WithArena) { CUDAExecutionProvider ep(info); AllocatorPtr gpu_alloctor = ep.CreatePreferredAllocators()[0]; - RunOptions run_opts; + onnxruntime::RunOptions run_opts; run_opts.run_tag = "log1"; // Allocator for call cudaMallocHost and cudaFreeHost // For details, see CUDAPinnedAllocator in cuda_allocator.cc. 
@@ -54,7 +57,7 @@ TEST(TestDeferredRelease, WithoutArena) { CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); - RunOptions run_opts; + onnxruntime::RunOptions run_opts; run_opts.run_tag = "log1"; OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc index 96c1e173316de..d8384b432786b 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc @@ -6,12 +6,11 @@ #include "core/providers/cuda/cuda_provider_factory_creator.h" #include "core/providers/cuda/cuda_provider_options.h" +#include "gtest/gtest.h" #include #include #include "core/common/gsl.h" -#include "gtest/gtest.h" - #include "core/providers/cuda/cuda_execution_provider.h" #include "core/providers/cuda/cuda_execution_provider_info.h" #include "core/providers/cuda/cuda_allocator.h" @@ -64,8 +63,15 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA { void cuda__Impl_Cast(void*, const float*, double*, size_t) override {} - Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); } - void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); } + Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, + const char* msg, const char* file, const int line) override { + return CudaCall(cudaError(retCode), exprString, libName, + cudaError(successCode), msg, file, line); + } + void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, + const char* msg, const char* file, const int line) override { + CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); + } void CopyGpuToCpu(void*, const void*, const size_t, const OrtMemoryInfo&, const OrtMemoryInfo&) override {} @@ -93,19 +99,27 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA { return nullptr; } - std::shared_ptr CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy, onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&, const OrtArenaCfg*) override { + std::shared_ptr CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy, + onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&, + const OrtArenaCfg*) override { return nullptr; } void TestAll() override { - // TestAll is the entry point of CUDA EP's insternal tests. + // TestAll is the entry point of CUDA EP's internal tests. // Those internal tests are not directly callable from onnxruntime_test_all // because CUDA EP is a shared library now. // Instead, this is a test provider that implements all the test cases. // onnxruntime_test_all is calling this function through TryGetProviderInfo_CUDA_Test. - int argc = 1; - std::string mock_exe_name = "onnxruntime_providers_cuda_ut"; - char* argv[] = {const_cast(mock_exe_name.data())}; + char mock_exe_name[] = "onnxruntime_providers_cuda_ut"; + + // InitGoogleTest decrements argc and removes args from argv if + // recognized. 
By doing so it decrements argc and shifts argv, + // to do so, from the code comments it expects argc + 1 with the last one always being nullptr + // otherwise, windows diagnostics reports stack corruption. when + int argc = 1; // Change argc to 2 and edit the filter below if necessary + char* argv[] = {mock_exe_name, nullptr}; + // char* argv[] = {mock_exe_name, "--gtest_filter=ReductionFunctionsTest.*", nullptr}; ::testing::InitGoogleTest(&argc, argv); ORT_ENFORCE(RUN_ALL_TESTS() == 0); } diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc index 9d20bc545df5f..7468a5718425e 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "gtest/gtest.h" + #include #include -#include "gtest/gtest.h" - #include "core/common/common.h" #include "core/providers/cuda/shared_inc/cuda_call.h" #include "core/providers/cuda/shared_inc/cuda_utils.h" @@ -32,7 +32,8 @@ void TestFillCorrectness(size_t num_elements, TElement value) { Fill(nullptr, buffer.get(), value, num_elements); auto cpu_buffer = std::make_unique(num_elements); - CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), cudaMemcpyKind::cudaMemcpyDeviceToHost)); + CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), + cudaMemcpyKind::cudaMemcpyDeviceToHost)); std::vector expected_data(num_elements, value); EXPECT_EQ(std::memcmp(cpu_buffer.get(), expected_data.data(), num_elements * sizeof(TElement)), 0); diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc index 4917701e5197d..6636e15040393 100644 --- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "gtest/gtest.h" + #include "core/common/common.h" #include "core/providers/cuda/cuda_common.h" -#include "gtest/gtest.h" - namespace onnxruntime { namespace cuda { namespace test { diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc index c460e806c1a80..ec7e98528504e 100644 --- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include - #include "gtest/gtest.h" +#include + #include "core/providers/cuda/shared_inc/cuda_utils.h" #include "core/common/optional.h" #include "core/providers/cuda/reduction/reduction_functions.h" diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index b7232e9dc4ba1..0e885b71b486a 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -150,7 +150,9 @@ stages: --enable_cuda_profiling --enable_cuda_nhwc_ops \ --enable_pybind --build_java \ --use_cache \ - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75; \ + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 \ + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON \ + --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON; \ ccache -sv; \ ccache -z" workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index eee38ac04b355..291e2f4e19401 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -42,7 +42,12 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: >- + --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --enable_cuda_profiling + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo @@ -59,7 +64,10 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: >- + --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --skip_onnx_tests + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo @@ -95,7 +103,11 @@ stages: EnvSetupScript: setup_env_cuda.bat buildArch: x64 # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags - additionalBuildFlags: --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF + additionalBuildFlags: >- + --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda + --cuda_home="$(Agent.TempDirectory)\v11.8" + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo From a9d9b083e425f102b1e1fbc9ed4d715afc392aab Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Wed, 27 Mar 2024 15:59:35 -0700 Subject: [PATCH 8/9] Fix py package pipeline (#20065) ### Description ### Motivation and Context Fixes #20068 --- 
 onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
index 5e6e484567988..c299cdcfe6a3d 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
@@ -656,7 +656,9 @@ inline __device__ float4 operator*(const float4 a, const float4 b) {
   return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
 }
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530
+// TODO(wy): use cuda common header and investigate pipeline build issue.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 && \
+    ((__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 2)))
 inline __device__ half operator*(const half a, const half b) {
   return __float2half(__half2float(a) * __half2float(b));
 }
@@ -666,8 +668,10 @@ inline __device__ half2 operator*(const half2 a, const half2 b) {
 }
 #endif
 
+// TODO(wy): use cuda common header and investigate pipeline build issue.
 inline __device__ Half4 operator*(const Half4 a, const Half4 b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 && \
+    ((__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 2)))
   Half4 result;
   result.x = a.x * b.x;
   result.y = a.y * b.y;

From 55f63a48ca07ceb7480756b68a7795a221d3a45d Mon Sep 17 00:00:00 2001
From: pengwa
Date: Thu, 28 Mar 2024 08:40:34 +0800
Subject: [PATCH 9/9] Keep original name during fusion (#20097)

### Keep original name during fusion

Keeping the original node name during fusion makes it easy to tell where a fused node came from, which is very useful when debugging execution order issues between different transformer layers. For example (see the sketch after this list for how the suffixes compose):

- A node named `/_original_module/model/layers.1/self_attn/MatMul/MatmulTransposeFusion//MatMulScaleFusion/` went through two fusion passes in the 1st transformer layer, namely `MatmulTransposeFusion` and `MatMulScaleFusion`.
- `/_original_module/model/layers.2/post_attention_layernorm/Mul_1/SimplifiedLayerNormFusion/` is a node fused by `SimplifiedLayerNormFusion`.
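The snippet below is a minimal, self-contained sketch of the naming convention only, not the optimizer code itself: `FusedNodeName` is a hypothetical helper that mirrors the `node.Name() + "/<FusionName>/"` pattern used by the fusions in this patch, and it shows how the suffixes stack up when a node goes through more than one fusion pass.

```cpp
// Sketch only: shows how fusion-pass suffixes accumulate on a node name.
#include <iostream>
#include <string>

// Hypothetical helper mirroring node.Name() + "/<FusionName>/" from the diff.
std::string FusedNodeName(const std::string& original, const std::string& fusion) {
  return original + "/" + fusion + "/";
}

int main() {
  std::string name = "/_original_module/model/layers.1/self_attn/MatMul";
  name = FusedNodeName(name, "MatmulTransposeFusion");
  name = FusedNodeName(name, "MatMulScaleFusion");
  // Prints /_original_module/model/layers.1/self_attn/MatMul/MatmulTransposeFusion//MatMulScaleFusion/
  // (the "//" appears because the second pass appends to a name that already ends with "/").
  std::cout << name << "\n";
  return 0;
}
```

In most of the fusions touched here the composed string is additionally passed through `graph.GenerateNodeName(...)`, which keeps the final name unique within the graph.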
### Motivation and Context --- onnxruntime/core/optimizer/gather_fusion.cc | 2 +- onnxruntime/core/optimizer/gemm_transpose_fusion.cc | 2 +- onnxruntime/core/optimizer/layer_norm_fusion.cc | 4 ++-- onnxruntime/core/optimizer/matmul_scale_fusion.cc | 2 +- onnxruntime/core/optimizer/matmul_transpose_fusion.cc | 6 +++--- onnxruntime/core/optimizer/quick_gelu_fusion.cc | 2 +- onnxruntime/test/optimizer/graph_transform_test.cc | 6 +++--- .../orttraining/core/optimizer/concat_replacement.cc | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/onnxruntime/core/optimizer/gather_fusion.cc b/onnxruntime/core/optimizer/gather_fusion.cc index 90cabff88122c..1f2b31526c6b8 100644 --- a/onnxruntime/core/optimizer/gather_fusion.cc +++ b/onnxruntime/core/optimizer/gather_fusion.cc @@ -273,7 +273,7 @@ Status GatherSliceToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int gra split_initializer_proto.add_dims(static_cast(split_values.size())); split_initializer_proto.mutable_int64_data()->Add(split_values.begin(), split_values.end()); NodeArg* split_initializer_arg = &graph_utils::AddInitializer(graph, split_initializer_proto); - Node& split_node = graph.AddNode(graph.GenerateNodeName("Split"), "Split", "Split for Fused Gather nodes", + Node& split_node = graph.AddNode(nodes_to_fuse[0].get().Name() + "/GatherSliceToSplitFusion/", "Split", "Split for Fused Gather nodes", {graph.GetNodeArg(node_arg->Name()), split_initializer_arg}, split_outputs); split_node.AddAttribute("axis", axis); split_node.SetExecutionProviderType(nodes_to_fuse[0].get().GetExecutionProviderType()); diff --git a/onnxruntime/core/optimizer/gemm_transpose_fusion.cc b/onnxruntime/core/optimizer/gemm_transpose_fusion.cc index b97cce9c2e785..a52517d23db86 100644 --- a/onnxruntime/core/optimizer/gemm_transpose_fusion.cc +++ b/onnxruntime/core/optimizer/gemm_transpose_fusion.cc @@ -75,7 +75,7 @@ Status GemmTransposeFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& m nodes_to_remove.push_back(output_node); } - Node& new_gemm_node = graph.AddNode(graph.GenerateNodeName(gemm_node.Name() + "_transformed"), + Node& new_gemm_node = graph.AddNode(graph.GenerateNodeName(gemm_node.Name() + "/GemmTransposeFusion/"), gemm_node.OpType(), "Fused Gemm with Transpose", new_gemm_input_defs, diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc index ce696154adb6d..48edf4854fbbb 100644 --- a/onnxruntime/core/optimizer/layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc @@ -455,7 +455,7 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, } InlinedVector layer_norm_input_defs{x_input, scale, bias}; - Node& layer_norm_node = graph.AddNode(graph.GenerateNodeName("LayerNormalization"), + Node& layer_norm_node = graph.AddNode(graph.GenerateNodeName(mul_node.Name() + "/LayerNormFusion/"), "LayerNormalization", "fused LayerNorm subgraphs ", layer_norm_input_defs, @@ -705,7 +705,7 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr InlinedVector layer_norm_input_defs{x_input, scale}; Node& layer_norm_node = - graph.AddNode(graph.GenerateNodeName("SimplifiedLayerNormalization"), "SimplifiedLayerNormalization", + graph.AddNode(graph.GenerateNodeName(mul_node.Name() + "/SimplifiedLayerNormFusion/"), "SimplifiedLayerNormalization", "fused LayerNorm subgraphs ", layer_norm_input_defs, {}, {}, kOnnxDomain); // Get constant "epsilon" from "Add" node if available. Else, default value will be used. 
diff --git a/onnxruntime/core/optimizer/matmul_scale_fusion.cc b/onnxruntime/core/optimizer/matmul_scale_fusion.cc index b04d794cc9469..e4cdeadbf54d7 100644 --- a/onnxruntime/core/optimizer/matmul_scale_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_scale_fusion.cc @@ -245,7 +245,7 @@ Status ProcessNode( } Node& matmul_scale_node = graph.AddNode( - graph.GenerateNodeName(node.Name() + "_FusedMatMulAndScale"), + graph.GenerateNodeName(node.Name() + "/MatMulScaleFusion/"), "FusedMatMul", "Fused MatMul and Scale", fused_node_inputs, diff --git a/onnxruntime/core/optimizer/matmul_transpose_fusion.cc b/onnxruntime/core/optimizer/matmul_transpose_fusion.cc index 789466778edc6..8eb224013618d 100644 --- a/onnxruntime/core/optimizer/matmul_transpose_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_transpose_fusion.cc @@ -154,14 +154,14 @@ static Node* ReorderCastAndTranspose(Graph& graph, Node* cast, const ONNX_NAMESPACE::TensorProto_DataType element_type = static_cast(cast_output->TypeAsProto()->tensor_type().elem_type()); new_cast_output_type_proto.mutable_tensor_type()->set_elem_type(element_type); - auto& new_cast_output = graph.GetOrCreateNodeArg(cast_output->Name() + "_transformed", &new_cast_output_type_proto); + auto& new_cast_output = graph.GetOrCreateNodeArg(cast_output->Name() + "/MatmulTransposeFusion/", &new_cast_output_type_proto); const std::array new_cast_input_defs{transpose_input}; const std::array new_cast_output_defs{&new_cast_output}; const std::array new_transpose_input_defs = {&new_cast_output}; const std::array new_transpose_output_defs = {cast_output}; - Node& new_cast = graph.AddNode(graph.GenerateNodeName(cast->Name() + "_transformed"), + Node& new_cast = graph.AddNode(graph.GenerateNodeName(cast->Name() + "/MatmulTransposeFusion/"), cast->OpType(), "Created a new Cast node to interchange Cast and Transpose nodes", new_cast_input_defs, @@ -385,7 +385,7 @@ Status MatmulTransposeFusion::ApplyImpl(Graph& graph, bool& modified, int graph_ const std::array input_defs{left_input, right_input}; const std::array output_defs{node.MutableOutputDefs()[0]}; - Node& matmul_node = graph.AddNode(graph.GenerateNodeName("MatMul_With_Transpose"), + Node& matmul_node = graph.AddNode(graph.GenerateNodeName(node.Name() + "/MatmulTransposeFusion/"), "FusedMatMul", "fused MatMul and Transpose ", input_defs, diff --git a/onnxruntime/core/optimizer/quick_gelu_fusion.cc b/onnxruntime/core/optimizer/quick_gelu_fusion.cc index 6e5eb5612a701..b09ef1c460b8e 100644 --- a/onnxruntime/core/optimizer/quick_gelu_fusion.cc +++ b/onnxruntime/core/optimizer/quick_gelu_fusion.cc @@ -88,7 +88,7 @@ Status QuickGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, NodeArg* quick_gelu_output_arg = mul_node.MutableOutputDefs()[0]; Node& quick_gelu_node = - graph.AddNode(graph.GenerateNodeName("QuickGelu"), "QuickGelu", "QuickGelu", std::array{quick_gelu_input_arg}, + graph.AddNode(graph.GenerateNodeName(mul_node.Name() + "/QuickGeluFusion/"), "QuickGelu", "QuickGelu", std::array{quick_gelu_input_arg}, std::array{quick_gelu_output_arg}, {}, kMSDomain); quick_gelu_node.AddAttribute("alpha", alpha); quick_gelu_node.SetExecutionProviderType(node.GetExecutionProviderType()); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 97f1feaaa612d..0d1f213618e54 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -2724,7 +2724,7 @@ TEST_F(GraphTransformationTests, 
GemmTransposeFusion2OutputsFromTranspose) { auto gemm_node = std::find_if( graph.Nodes().cbegin(), graph.Nodes().cend(), - [](const Node& node) { return node.Name() == "Gemm_transformed"; }); + [](const Node& node) { return node.Name() == "Gemm/GemmTransposeFusion/"; }); auto& node = *gemm_node; ASSERT_TRUE(node.OpType() == "Gemm"); @@ -2760,7 +2760,7 @@ TEST_F(GraphTransformationTests, GemmTransposeFusion2OutputsFromTransposeTo2Gemm auto gemm1_node = std::find_if( graph.Nodes().cbegin(), graph.Nodes().cend(), - [](const Node& node) { return node.Name() == "Gemm1_transformed"; }); + [](const Node& node) { return node.Name() == "Gemm1/GemmTransposeFusion/"; }); auto& node1 = *gemm1_node; ASSERT_TRUE(node1.OpType() == "Gemm"); @@ -2773,7 +2773,7 @@ TEST_F(GraphTransformationTests, GemmTransposeFusion2OutputsFromTransposeTo2Gemm auto gemm2_node = std::find_if( graph.Nodes().cbegin(), graph.Nodes().cend(), - [](const Node& node) { return node.Name() == "Gemm2_transformed"; }); + [](const Node& node) { return node.Name() == "Gemm2/GemmTransposeFusion/"; }); auto& node2 = *gemm2_node; ASSERT_TRUE(node2.OpType() == "Gemm"); diff --git a/orttraining/orttraining/core/optimizer/concat_replacement.cc b/orttraining/orttraining/core/optimizer/concat_replacement.cc index 37d302765cda8..2c919591ec081 100644 --- a/orttraining/orttraining/core/optimizer/concat_replacement.cc +++ b/orttraining/orttraining/core/optimizer/concat_replacement.cc @@ -23,7 +23,7 @@ Status ConcatReplacement::Apply(Graph& graph, Node& concat_node, RewriteRuleEffe concat_outputs.push_back(&ip_shape_op); - Node& concat_training_node = graph.AddNode(graph.GenerateNodeName("ConcatTraining"), + Node& concat_training_node = graph.AddNode(graph.GenerateNodeName(concat_node.Name() + "/ConcatReplacement/"), "ConcatTraining", "Concat with extra output", concat_inputs,