From d1118e959ffeebd7fb9ada2918f11fe1b0ddae20 Mon Sep 17 00:00:00 2001
From: Jingyan Wang
Date: Tue, 22 Oct 2024 19:36:30 +0000
Subject: [PATCH 1/4] Add trt accuracy test

---
 .../python/onnxruntime_test_python_trt_acc.py | 351 ++++++++++++++++++
 tools/ci_build/build.py                       |  11 +-
 2 files changed, 358 insertions(+), 4 deletions(-)
 create mode 100644 onnxruntime/test/python/onnxruntime_test_python_trt_acc.py

diff --git a/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py b/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
new file mode 100644
index 0000000000000..9cf3a1523756e
--- /dev/null
+++ b/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
@@ -0,0 +1,351 @@
+"""
+This test compares the output of the Hugging Face models below
+- "microsoft/resnet-50"
+- "microsoft/Phi-3.5-mini-instruct"
+on PyTorch CPU vs [ORT CPU EP, ORT TensorRT EP] with different configurations (fp16, no ORT graph optimization, 1-layer transformer vs full model).
+"""
+from transformers import AutoImageProcessor, ResNetForImageClassification
+from transformers import AutoModel, AutoTokenizer
+from transformers import AutoModelForCausalLM
+import torch
+from transformers.onnx import export
+import onnxruntime as ort
+import numpy as np
+import time
+import unittest
+import onnx
+from onnx import helper, TensorProto
+
+def run_model_in_pytorch(model, inputs):
+    with torch.no_grad():
+        output = model(**inputs).logits
+    return output
+
+def run_model_in_ort(model_file, inputs, ep, disable_ort_graph_optimization=False):
+    if disable_ort_graph_optimization:
+        sess_opt = ort.SessionOptions()
+        sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
+    else:
+        sess_opt = None
+    session = ort.InferenceSession(model_file, providers=ep, sess_options=sess_opt)
+    # model_inputs = session.get_inputs()
+    # input_data = np.array(input_tensor)
+    # outputs = session.run(None, {model_inputs[0].name: input_data})
+    outputs = session.run(None, inputs)
+    output = np.array(outputs[0])
+    return output
+
+
+def get_model_and_inputs(model_name, use_minimal_model=True):
+    if model_name == "microsoft/resnet-50":
+        model = ResNetForImageClassification.from_pretrained(model_name)
+        # if use_minimal_model:
+        #     model.config.num_channels = 1
+        #     model.config.embedding_size = 1
+        #     model.config.hidden_sizes = [1, 2]
+        #     model.config.depths = [1, 2]
+        input_tensor = torch.randn(1, 3, 224, 224)
+        pytorch_inputs = {'pixel_values': input_tensor}
+        # inputs key value need to match forward()
+        ort_inputs = {'pixel_values': input_tensor.numpy()}
+    elif model_name == "microsoft/Phi-3.5-mini-instruct":
+        model = AutoModelForCausalLM.from_pretrained(model_name)
+        if use_minimal_model:
+            print(f"Using 1 layer model for {model_name}")
+            # Reduce the number of hidden layers (for example, keeping only 1 layer)
+            model.model.layers = model.model.layers[:1]
+            # Update the configuration to reflect the reduced number of layers
+            model.config.num_hidden_layers = 1 # default 32
+            # input_tensor = torch.randint(0, model.config.vocab_size, (1, 30)) # Batch size 1, sequence length 30
+            # inputs = {'input_ids': random_input_ids}
+        else:
+            print(f"Using full model for {model_name}")
+            # model.model.layers = model.model.layers[:4]
+            # # Update the configuration to reflect the reduced number of layers
+            # model.config.num_hidden_layers = 4 # default 32
+        dim = (1, 30)
+        input_ids = torch.randint(0, 32064, dim) # 32064 is vocab size
+        attention_masks = torch.ones(*dim, dtype=torch.int64)
+
+        # Prepare the inputs for the model
+        pytorch_inputs = {'input_ids': input_ids, 'attention_mask': attention_masks}
+        # inputs key value need to match forward()
+        ort_inputs = {
+            'input_ids': pytorch_inputs['input_ids'].numpy(),
+            'attention_mask': pytorch_inputs['attention_mask'].numpy(),
+            'onnx::Neg_2': torch.ones(1, dtype=torch.int64).numpy() # ORT requires this input since it's in the exported graph
+        }
+    return model, pytorch_inputs, ort_inputs
+
+def get_ep(use_tensorrt=True, use_fp16=True):
+    if not use_tensorrt:
+        return [('CPUExecutionProvider', {})]
+    else:
+        return [
+            ('TensorrtExecutionProvider', {'trt_fp16_enable': use_fp16})
+        ]
+
+"""
+This hacky fix is required to fix the onnx model graph.
+Some Slice nodes are missing starts/ends inputs after onnx.export
+"""
+def fix_phi35_model(onnx_model_filename):
+    model = onnx.load(onnx_model_filename)
+    graph = model.graph
+
+    # Iterate through nodes to find the node by name
+    for node in graph.node:
+        if node.name == "/model/layers.0/mlp/Slice_1":
+            # print(f"Found node: {node.name}")
+            # print(node) # Print the details of the node
+            # print(node.input)
+            node.input[1] = "/model/layers.0/mlp/Constant_6_output_0" # starts
+            node.input[2] = "/model/layers.0/mlp/Constant_7_output_0" # ends
+
+        if node.name == "/model/layers.0/mlp/Slice":
+            # print(f"Found node: {node.name}")
+            # print(node) # Print the details of the node
+            # print(node.input)
+            node.input[2] = "/model/layers.0/mlp/Constant_6_output_0" # ends
+
+        if node.name == "/Slice":
+            # print(f"Found node: {node.name}")
+            # print(node) # Print the details of the node
+            # print(node.input)
+            node.input[1] = "/Constant41_output_0"
+            # return
+        # if node.name == "/model/layers.0/mlp/Mul_output_0":
+        #     print(f"Found node: {node.name}")
+        #     print(node) # Print the details of the node
+        #     # return
+        # if node.name == "/model/layers.0/mlp/Constant_1_output_0":
+        #     print(f"Found node: {node.name}")
+        #     print(node) # Print the details of the node
+        # if node.name == "/model/layers.0/mlp/Mul_1":
+        #     print(node)
+        # if node.name == "/model/layers.0/mlp/Constant_1":
+        #     print(node)
+
+    # for initializer in graph.initializer:
+    #     print(f"Name: {initializer.name}")
+    #     tensor_value = onnx.numpy_helper.to_array(initializer)
+    #     print(f"Value: {tensor_value}")
+    #     print(tensor_value)
+    #     if initializer.name == "/model/layers.0/mlp/Mul_output_0":
+    #         print(f"Tensor '{initializer.name}' found in initializers.")
+    #         tensor_value = numpy_helper.to_array(initializer)
+    #         print(f"Value: {tensor_value}")
+    #         print(tensor_value)
+    #         # return tensor_value
+    #     if initializer.name == "/model/layers.0/mlp/Constant_1_output_0":
+    #         print(f"Tensor '{initializer.name}' found in initializers.")
+    #         tensor_value = numpy_helper.to_array(initializer)
+    #         print(f"Value: {tensor_value}")
+    #         print(node)
+
+    # for node in graph.output:
+    #     print(node)
+    #     if node.name == "/model/layers.0/mlp/Mul_output_0":
+    #         print(f"Tensor '{node.name}' found (op_type: {node.op_type}) .")
+    #         print(node)
+    #         # return node
+    #     if node.name == "/model/layers.0/mlp/Constant_1_output_0":
+    #         print(f"Tensor '{node.name}' found (op_type: {node.op_type}) .")
+    #         print(node)
+
+    # for node in graph.node:
+    #     if node.op_type == "Constant":
+    #         print(node)
+
+    # print(f"Node '{node_name}' not found in the model.")
+    # data = np.array([8192], dtype=np.int64)
+    # # raw_bytes = data.tobytes()
+    # # # raw_bytes = struct('
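
The core pattern this first patch introduces is: run the Hugging Face model once under PyTorch, export it with torch.onnx.export, run the exported graph under ORT with a chosen execution provider, and compare the two outputs within rtol/atol. A minimal standalone sketch of that pattern (the model choice, file path, provider options, and the assert_allclose step are illustrative assumptions, not the test's exact code):

    # Sketch of the PyTorch-vs-ORT accuracy comparison; names and tolerances are illustrative.
    import numpy as np
    import onnxruntime as ort
    import torch
    from transformers import ResNetForImageClassification

    model = ResNetForImageClassification.from_pretrained("microsoft/resnet-18")
    input_tensor = torch.randn(1, 3, 224, 224)
    with torch.no_grad():
        expected = model(pixel_values=input_tensor).logits.numpy()  # PyTorch reference output

    torch.onnx.export(model, (input_tensor,), "model.onnx", input_names=['pixel_values'])
    session = ort.InferenceSession("model.onnx", providers=[('CPUExecutionProvider', {})])
    actual = session.run(None, {'pixel_values': input_tensor.numpy()})[0]

    # Assumed comparison step; run_comparison exposes rtol/atol for this purpose.
    np.testing.assert_allclose(expected, actual, rtol=1e-2, atol=1e-2)
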
From: Jingyan Wang
Date: Tue, 22 Oct 2024 20:14:20 +0000
Subject: [PATCH 2/4] Clean up

---
 .../python/onnxruntime_test_python_trt_acc.py | 246 ++++++------------
 tools/ci_build/build.py                       |   1 +
 2 files changed, 74 insertions(+), 173 deletions(-)

diff --git a/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py b/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
index 9cf3a1523756e..9683317b37733 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
@@ -1,8 +1,8 @@
 """
 This test compares the output of the Hugging Face models below
-- "microsoft/resnet-50"
-- "microsoft/Phi-3.5-mini-instruct"
-on PyTorch CPU vs [ORT CPU EP, ORT TensorRT EP] with different configurations (fp16, no ORT graph optimization, 1-layer transformer vs full model).
+- microsoft/resnet-18 and microsoft/resnet-50
+- microsoft/Phi-3.5-mini-instruct with a 1-layer transformer vs the full model
+on PyTorch CPU vs [ORT CPU EP, ORT TensorRT EP] with different configurations [fp16, no ORT graph optimization].
 """
 from transformers import AutoImageProcessor, ResNetForImageClassification
 from transformers import AutoModel, AutoTokenizer
@@ -21,29 +21,21 @@ def run_model_in_pytorch(model, inputs):
     output = model(**inputs).logits
     return output
 
-def run_model_in_ort(model_file, inputs, ep, disable_ort_graph_optimization=False):
-    if disable_ort_graph_optimization:
+def run_model_in_ort(model_file, inputs, ep, use_graph_opt=True):
+    if use_graph_opt:
+        sess_opt = None
+    else:
         sess_opt = ort.SessionOptions()
         sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
-    else:
-        sess_opt = None
     session = ort.InferenceSession(model_file, providers=ep, sess_options=sess_opt)
-    # model_inputs = session.get_inputs()
-    # input_data = np.array(input_tensor)
-    # outputs = session.run(None, {model_inputs[0].name: input_data})
     outputs = session.run(None, inputs)
     output = np.array(outputs[0])
     return output
 
 
 def get_model_and_inputs(model_name, use_minimal_model=True):
-    if model_name == "microsoft/resnet-50":
+    if model_name == "microsoft/resnet-50" or model_name == "microsoft/resnet-18":
         model = ResNetForImageClassification.from_pretrained(model_name)
-        # if use_minimal_model:
-        #     model.config.num_channels = 1
-        #     model.config.embedding_size = 1
-        #     model.config.hidden_sizes = [1, 2]
-        #     model.config.depths = [1, 2]
         input_tensor = torch.randn(1, 3, 224, 224)
         pytorch_inputs = {'pixel_values': input_tensor}
         # inputs key value need to match forward()
@@ -56,8 +48,6 @@ def get_model_and_inputs(model_name, use_minimal_model=True):
             model.model.layers = model.model.layers[:1]
             # Update the configuration to reflect the reduced number of layers
             model.config.num_hidden_layers = 1 # default 32
-            # input_tensor = torch.randint(0, model.config.vocab_size, (1, 30)) # Batch size 1, sequence length 30
-            # inputs = {'input_ids': random_input_ids}
         else:
             print(f"Using full model for {model_name}")
             # model.model.layers = model.model.layers[:4]
             # # Update the configuration to reflect the reduced number of layers
             # model.config.num_hidden_layers = 4 # default 32
@@ -96,110 +86,39 @@ def fix_phi35_model(onnx_model_filename):
     # Iterate through nodes to find the node by name
     for node in graph.node:
         if node.name == "/model/layers.0/mlp/Slice_1":
-            # print(f"Found node: {node.name}")
-            # print(node) # Print the details of the node
-            # print(node.input)
-            node.input[1] = "/model/layers.0/mlp/Constant_6_output_0" # starts
-            node.input[2] = "/model/layers.0/mlp/Constant_7_output_0" # ends
+            node.input[1] = "/model/layers.0/mlp/Constant_6_output_0" # starts input
+            node.input[2] = "/model/layers.0/mlp/Constant_7_output_0" # ends input
 
         if node.name == "/model/layers.0/mlp/Slice":
-            # print(f"Found node: {node.name}")
-            # print(node) # Print the details of the node
-            # print(node.input)
-            node.input[2] = "/model/layers.0/mlp/Constant_6_output_0" # ends
+            node.input[2] = "/model/layers.0/mlp/Constant_6_output_0" # ends input
 
         if node.name == "/Slice":
-            # print(f"Found node: {node.name}")
-            # print(node) # Print the details of the node
-            # print(node.input)
-            node.input[1] = "/Constant41_output_0"
-            # return
-        # if node.name == "/model/layers.0/mlp/Mul_output_0":
-        #     print(f"Found node: {node.name}")
-        #     print(node) # Print the details of the node
-        #     # return
-        # if node.name == "/model/layers.0/mlp/Constant_1_output_0":
-        #     print(f"Found node: {node.name}")
-        #     print(node) # Print the details of the node
-        # if node.name == "/model/layers.0/mlp/Mul_1":
-        #     print(node)
-        # if node.name == "/model/layers.0/mlp/Constant_1":
-        #     print(node)
-
-    # for initializer in graph.initializer:
-    #     print(f"Name: {initializer.name}")
-    #     tensor_value = onnx.numpy_helper.to_array(initializer)
-    #     print(f"Value: {tensor_value}")
-    #     print(tensor_value)
-    #     if initializer.name == "/model/layers.0/mlp/Mul_output_0":
-    #         print(f"Tensor '{initializer.name}' found in initializers.")
-    #         tensor_value = numpy_helper.to_array(initializer)
-    #         print(f"Value: {tensor_value}")
-    #         print(tensor_value)
-    #         # return tensor_value
-    #     if initializer.name == "/model/layers.0/mlp/Constant_1_output_0":
-    #         print(f"Tensor '{initializer.name}' found in initializers.")
-    #         tensor_value = numpy_helper.to_array(initializer)
-    #         print(f"Value: {tensor_value}")
-    #         print(node)
-
-    # for node in graph.output:
-    #     print(node)
-    #     if node.name == "/model/layers.0/mlp/Mul_output_0":
-    #         print(f"Tensor '{node.name}' found (op_type: {node.op_type}) .")
-    #         print(node)
-    #         # return node
-    #     if node.name == "/model/layers.0/mlp/Constant_1_output_0":
-    #         print(f"Tensor '{node.name}' found (op_type: {node.op_type}) .")
-    #         print(node)
-
-    # for node in graph.node:
-    #     if node.op_type == "Constant":
-    #         print(node)
+            node.input[1] = "/Constant41_output_0" # starts input
 
-    # print(f"Node '{node_name}' not found in the model.")
-    # data = np.array([8192], dtype=np.int64)
-    # # raw_bytes = data.tobytes()
-    # # # raw_bytes = struct('
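
One piece of context for the Slice rewiring above: since opset 10, the ONNX Slice operator takes starts/ends (and axes/steps) as inputs rather than attributes, which is why the fix redirects the node's input slots at existing constant outputs in the graph. A minimal sketch of that repair, assuming suitable constants are already present ("model.onnx" is a placeholder path; the node and tensor names are the ones the patch targets):

    # Sketch: rewire the inputs of a named Slice node in an ONNX graph.
    import onnx

    model = onnx.load("model.onnx")  # placeholder path
    for node in model.graph.node:
        if node.name == "/model/layers.0/mlp/Slice_1":
            # Slice inputs (opset >= 10): data, starts, ends, axes, steps
            node.input[1] = "/model/layers.0/mlp/Constant_6_output_0"  # starts
            node.input[2] = "/model/layers.0/mlp/Constant_7_output_0"  # ends

    onnx.checker.check_model(model)  # basic validity check before handing the graph to ORT
    onnx.save(model, "model_fixed.onnx")
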
From: Jingyan Wang
Date: Tue, 22 Oct 2024 20:38:18 +0000
Subject: [PATCH 3/4] Add github issue link

---
 onnxruntime/test/python/onnxruntime_test_python_trt_acc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py b/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
index 9683317b37733..6ff9ac2502f01 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
@@ -76,7 +76,7 @@ def get_ep(use_tensorrt=True, use_fp16=True):
     ]
 
 """
-This hacky fix is required to fix the onnx model graph.
+This hacky fix is required to fix the onnx model graph. GitHub issue: https://github.com/pytorch/pytorch/issues/138637
 Some Slice nodes are missing starts/ends inputs after onnx.export
 """
 def fix_phi35_model(onnx_model_filename):
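
A detail worth noting before the final patch: fix_phi35_model finishes by re-saving the patched model with external weights, because a single serialized protobuf file is capped at 2 GB and the full Phi-3.5 graph exceeds it. The save pattern, as a standalone sketch (paths are illustrative):

    # Sketch: save a >2GB ONNX model with tensor data in an external file.
    import onnx

    model = onnx.load("phi35.onnx")  # placeholder path; external data is loaded alongside
    onnx.save_model(
        model,
        "phi35.onnx",
        save_as_external_data=True,    # keep weights out of the 2GB-limited protobuf
        all_tensors_to_one_file=True,  # one weights file next to the model
        location="external_weights",
        size_threshold=1024,           # only tensors larger than 1024 bytes are externalized
        convert_attribute=False,       # leave attribute tensors inline
    )
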
From 3a10abc9f5d5226c4dbd86af769129e652d653cd Mon Sep 17 00:00:00 2001
From: Jingyan Wang
Date: Wed, 13 Nov 2024 20:08:58 +0000
Subject: [PATCH 4/4] Add cuda EP

---
 .../python/onnxruntime_test_python_trt_acc.py | 67 ++++++++++++------
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py b/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
index 6ff9ac2502f01..fe28160021a77 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_trt_acc.py
@@ -22,6 +22,7 @@ def run_model_in_pytorch(model, inputs):
     return output
 
 def run_model_in_ort(model_file, inputs, ep, use_graph_opt=True):
+    ort.set_default_logger_severity(0)
     if use_graph_opt:
         sess_opt = None
     else:
         sess_opt = ort.SessionOptions()
         sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
@@ -32,7 +33,6 @@ def run_model_in_ort(model_file, inputs, ep, use_graph_opt=True):
     outputs = session.run(None, inputs)
     output = np.array(outputs[0])
     return output
-
 def get_model_and_inputs(model_name, use_minimal_model=True):
     if model_name == "microsoft/resnet-50" or model_name == "microsoft/resnet-18":
         model = ResNetForImageClassification.from_pretrained(model_name)
@@ -50,9 +50,9 @@ def get_model_and_inputs(model_name, use_minimal_model=True):
             model.config.num_hidden_layers = 1 # default 32
         else:
             print(f"Using full model for {model_name}")
-            # model.model.layers = model.model.layers[:4]
+            model.model.layers = model.model.layers[:2]
             # # Update the configuration to reflect the reduced number of layers
-            # model.config.num_hidden_layers = 4 # default 32
+            model.config.num_hidden_layers = 2 # default 32
         dim = (1, 30)
         input_ids = torch.randint(0, 32064, dim) # 32064 is vocab size
         attention_masks = torch.ones(*dim, dtype=torch.int64)
@@ -67,13 +67,20 @@ def get_model_and_inputs(model_name, use_minimal_model=True):
     }
     return model, pytorch_inputs, ort_inputs
 
-def get_ep(use_tensorrt=True, use_fp16=True):
-    if not use_tensorrt:
-        return [('CPUExecutionProvider', {})]
+def get_ep(ep_name='CPUExecutionProvider', use_fp16=True):
+    # Check valid EP, fall back to CPU
+    if ep_name not in ['CPUExecutionProvider', 'CUDAExecutionProvider', 'TensorrtExecutionProvider']:
+        ep_name = 'CPUExecutionProvider'
+    # if not use_tensorrt:
+    #     return [('CPUExecutionProvider', {})]
+    # else:
+    #     return [
+    #         ('TensorrtExecutionProvider', {'trt_fp16_enable': use_fp16})
+    #     ]
+    if ep_name == 'TensorrtExecutionProvider':
+        return [('TensorrtExecutionProvider', {'trt_fp16_enable': use_fp16})]
     else:
-        return [
-            ('TensorrtExecutionProvider', {'trt_fp16_enable': use_fp16})
-        ]
+        return [(ep_name, {})]
 
 """
 This hacky fix is required to fix the onnx model graph. GitHub issue: https://github.com/pytorch/pytorch/issues/138637
@@ -149,7 +156,7 @@ def fix_phi35_model(onnx_model_filename):
 
     # Overwrite old model file with external weights since Phi3.5 full model exceeds 2GB
     onnx.save_model(model, onnx_model_filename, save_as_external_data=True, all_tensors_to_one_file=True, location="external_weights", size_threshold=1024, convert_attribute=False)
 
-def run_comparison(self, model_name, use_minimal_model=True, use_tensorrt=True, use_fp16=True, use_graph_opt=True, rtol=1e-2, atol=1e-2):
+def run_comparison(self, model_name, use_minimal_model=True, ep_name='CPUExecutionProvider', use_fp16=True, use_graph_opt=True, rtol=1e-2, atol=1e-2):
     start_time = time.time()
     model, pytorch_inputs, ort_inputs = get_model_and_inputs(model_name, use_minimal_model)
     pytorch_output = run_model_in_pytorch(model, pytorch_inputs)
@@ -161,7 +168,7 @@ def run_comparison(self, model_name, use_minimal_model=True, use_tensorrt=True,
     torch.onnx.export(model, pytorch_inputs, model_file, input_names=input_names)
     if model_name == "microsoft/Phi-3.5-mini-instruct":
         fix_phi35_model(model_file)
-    providers = get_ep(use_tensorrt, use_fp16)
+    providers = get_ep(ep_name, use_fp16)
     ort_output = run_model_in_ort(model_file, ort_inputs, providers, use_graph_opt=use_graph_opt)
     # print(f"pytorch_output={pytorch_output}")
     # print(f"ort_output={ort_output}")
@@ -192,27 +199,27 @@ class TestResnetAccuracy(unittest.TestCase):
 
     def test_resnet18_cpu_fp32_wo_opt(self):
         run_comparison(self, "microsoft/resnet-18",
-            use_minimal_model=False, use_tensorrt=False, use_fp16=False, use_graph_opt=False)
+            use_minimal_model=False, ep_name='CPUExecutionProvider', use_fp16=False, use_graph_opt=False)
 
     def test_resnet18_cpu_fp32(self):
         run_comparison(self, "microsoft/resnet-18",
-            use_minimal_model=False, use_tensorrt=False, use_fp16=False, use_graph_opt=True)
-
-    def test_resnet18_cpu_fp32(self):
-        run_comparison(self, "microsoft/resnet-18",
-            use_minimal_model=False, use_tensorrt=True, use_fp16=False, use_graph_opt=True)
+            use_minimal_model=False, ep_name='CPUExecutionProvider', use_fp16=False, use_graph_opt=True)
 
     def test_resnet18_trt_fp32(self):
         run_comparison(self, "microsoft/resnet-18",
-            use_minimal_model=False, use_tensorrt=True, use_fp16=True, use_graph_opt=True)
+            use_minimal_model=False, ep_name='TensorrtExecutionProvider', use_fp16=False, use_graph_opt=True)
 
     def test_resnet18_trt_fp16(self):
         run_comparison(self, "microsoft/resnet-18",
-            use_minimal_model=False, use_tensorrt=True, use_fp16=False, use_graph_opt=True)
+            use_minimal_model=False, ep_name='TensorrtExecutionProvider', use_fp16=True, use_graph_opt=True)
 
     def test_resnet50_trt_fp16(self):
         run_comparison(self, "microsoft/resnet-50",
-            use_minimal_model=False, use_tensorrt=True, use_fp16=False, use_graph_opt=True)
+            use_minimal_model=False, ep_name='TensorrtExecutionProvider', use_fp16=True, use_graph_opt=True)
+
+    def test_resnet50_cuda_fp16(self):
+        run_comparison(self, "microsoft/resnet-50",
+            use_minimal_model=False, ep_name='CUDAExecutionProvider', use_fp16=False, use_graph_opt=True)
 
 """
 Test Phi3.5 (1 layer) and full Phi3.5 with different configurations
@@ -227,24 +234,32 @@ class TestPhi35Accuracy(unittest.TestCase):
 
     def test_phi35_1l_cpu_fp32_wo_opt(self):
         run_comparison(self, "microsoft/Phi-3.5-mini-instruct",
-            use_minimal_model=True, use_tensorrt=False, use_fp16=False, use_graph_opt=False)
+            use_minimal_model=True, ep_name='CPUExecutionProvider', use_fp16=False, use_graph_opt=False)
 
    def test_phi35_1l_cpu_fp32(self):
"microsoft/Phi-3.5-mini-instruct", - use_minimal_model=True, use_tensorrt=False, use_fp16=False, use_graph_opt=True) + use_minimal_model=True, ep_name='CPUExecutionProvider', use_fp16=False, use_graph_opt=True) def test_phi35_1l_trt_fp32(self): run_comparison(self, "microsoft/Phi-3.5-mini-instruct", - use_minimal_model=True, use_tensorrt=True, use_fp16=False, use_graph_opt=True) + use_minimal_model=True, ep_name='TensorrtExecutionProvider', use_fp16=False, use_graph_opt=True) def test_phi35_1l_trt_fp16(self): run_comparison(self, "microsoft/Phi-3.5-mini-instruct", - use_minimal_model=True, use_tensorrt=True, use_fp16=True, use_graph_opt=True, + use_minimal_model=True, ep_name='TensorrtExecutionProvider', use_fp16=True, use_graph_opt=True, rtol=1e-1, atol=1e-1) # Need to relax rtol and atol for fp16 test case to pass - def test_phi35_full_trt_fp16(self): + # def test_phi35_full_trt_fp16(self): + # run_comparison(self, "microsoft/Phi-3.5-mini-instruct", + # use_minimal_model=False, ep_name='TensorrtExecutionProvider', use_fp16=True, use_graph_opt=True) + + def test_phi35_1l_cuda_fp16(self): + run_comparison(self, "microsoft/Phi-3.5-mini-instruct", + use_minimal_model=True, ep_name='CUDAExecutionProvider', use_fp16=True, use_graph_opt=True) + + def test_phi35_full_cuda_fp16(self): run_comparison(self, "microsoft/Phi-3.5-mini-instruct", - use_minimal_model=False, use_tensorrt=True, use_fp16=True, use_graph_opt=True) + use_minimal_model=False, ep_name='CUDAExecutionProvider', use_fp16=True, use_graph_opt=True) if __name__ == "__main__":