Skip to content

Commit

Permalink
Update eval_squad to use API of latest optimum (#17918)
Browse files Browse the repository at this point in the history
Update eval_squad with latest optimum. 

Tested with:
* optimum 1.13.1
* transformers 4.31.0
* onnxruntime-gpu 1.16.0
* onnx 1.14.1
* datasets 2.14.5
* evaluate 0.4.0
* torch version 2.2.0.dev20230920+cu121

Example output on an A100 GPU:

{'exact': 86.66035950804162, 'f1': 92.99622739711005, 'total': 10570,
'HasAns_exact': 86.66035950804162, 'HasAns_f1': 92.99622739711005,
'HasAns_total': 10570, 'best_exact': 86.66035950804162,
'best_exact_thresh': 0.9998456239700317, 'best_f1': 92.9962273971104,
'best_f1_thresh': 0.9998456239700317, 'total_time_in_seconds':
84.74025378189981, 'samples_per_second': 124.73410838731417,
'latency_in_seconds': 0.008017053337928081, 'provider':
'CUDAExecutionProvider', 'disable_fused_attention': False,
'pretrained_model_name':
'bert-large-uncased-whole-word-masking-finetuned-squad', 'onnx_path':
'./bert-large-uncased-whole-word-masking-finetuned-squad/optimized_model.onnx',
'batch_size': 1, 'sequence_length': 384, 'use_io_binding': True}
  • Loading branch information
tianleiwu authored Oct 13, 2023
1 parent 7551dd0 commit c695de9
Showing 1 changed file with 48 additions and 25 deletions.
73 changes: 48 additions & 25 deletions onnxruntime/python/tools/transformers/models/bert/eval_squad.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,26 @@
# This script evaluates accuracy of ONNX models for question-answering task on SQuAD data set.
# Example to evaluate raw and optimized model for CUDA in Linux:
# pip3 install datasets evaluate optimum transformers onnxruntime-gpu
# python3 eval_squad.py -m distilbert-base-cased-distilled-squad
# python3 -m onnxruntime.transformers.optimizer --output optimized_fp16.onnx --num_heads 12 --hidden_size 768 \
# --input /home/$USER/.cache/huggingface/hub/distilbert-base-cased-distilled-squad/model.onnx \
# --use_mask_index --float16
# python3 eval_squad.py -m distilbert-base-cased-distilled-squad --onnx optimized_fp16.onnx

#
# python3 eval_squad.py -m bert-large-uncased-whole-word-masking-finetuned-squad -s 384 -b 1 --use_io_binding
#
# python3 -m onnxruntime.transformers.optimizer \
# --input ./bert-large-uncased-whole-word-masking-finetuned-squad/model.onnx \
# --output ./bert-large-uncased-whole-word-masking-finetuned-squad/optimized_model.onnx
#
# python3 eval_squad.py -m bert-large-uncased-whole-word-masking-finetuned-squad -s 384 -b 1 --use_io_binding \
# --onnx ./bert-large-uncased-whole-word-masking-finetuned-squad/optimized_model.onnx
#
# Snippet of example output on an A100 GPU:
# {'exact': 86.65089877010406, 'f1': 92.99433524952254, 'total': 10570, 'HasAns_exact': 86.65089877010406,
# 'total_time_in_seconds': 81.69239814393222, 'samples_per_second': 129.387804008115,
# 'latency_in_seconds': 0.007728703703304846, 'provider': 'CUDAExecutionProvider',
# 'pretrained_model_name': 'bert-large-uncased-whole-word-masking-finetuned-squad',
# 'batch_size': 1, 'sequence_length': 384, 'use_io_binding': True}
import argparse
import csv
import os
import time

try:
from importlib.metadata import PackageNotFoundError, version
Expand All @@ -24,17 +35,15 @@
from pathlib import Path
from typing import Any, Dict, List, Optional

import torch
from datasets import load_dataset
from evaluate import evaluator
from optimum.onnxruntime import ORTModelForQuestionAnswering
from optimum.onnxruntime.modeling_ort import ORTModel
from optimum.version import __version__ as optimum_version
from packaging import version as version_check
from transformers import AutoTokenizer, pipeline

if version_check.parse(optimum_version) < version_check.parse("1.6.0"):
raise ImportError(f"Please install optimum>=1.6.0. The version {optimum_version} was found.")
if version_check.parse(optimum_version) < version_check.parse("1.13.1"):
raise ImportError(f"Please install optimum>=1.13.1. Current version: {optimum_version}.")

PRETRAINED_SQUAD_MODELS = [
"bert-large-uncased-whole-word-masking-finetuned-squad",
Expand Down Expand Up @@ -64,23 +73,24 @@ def load_onnx_model(
model: ORTModel for the onnx model
onnx_path: the path of onnx model
"""
model = ORTModelForQuestionAnswering.from_pretrained(model_id, from_transformers=True)

if onnx_path is not None:
model.model_name = Path(onnx_path).name

if provider != "CPUExecutionProvider":
model.device = torch.device("cuda:0")
model.model = ORTModel.load_model(onnx_path, provider)
else:
model.device = torch.device("cpu")
model.model = ORTModel.load_model(onnx_path)
if onnx_path is None:
# Export onnx to a sub-directory named by the model id
model = ORTModelForQuestionAnswering.from_pretrained(
model_id, export=True, provider=provider, use_io_binding=use_io_binding
)
save_onnx_dir = os.path.join(".", model_id)
model.save_pretrained(save_onnx_dir)
onnx_path = os.path.join(save_onnx_dir, "model.onnx")
print("Model is exported to onnx file:", onnx_path)
else:
onnx_path = os.path.join(model.model_save_dir.as_posix(), model.model_name)
if provider != "CPUExecutionProvider":
model.to("cuda")

model.use_io_binding = use_io_binding
model = ORTModelForQuestionAnswering.from_pretrained(
os.path.dirname(onnx_path),
file_name=Path(onnx_path).name,
provider=provider,
use_io_binding=use_io_binding,
# provider_options={"enable_skip_layer_norm_strict_mode": True},
)

return model, onnx_path

Expand Down Expand Up @@ -211,7 +221,12 @@ def main():
for sequence_length in args.sequence_lengths:
tokenizer.model_max_length = sequence_length
tokenizer.doc_stride = min(sequence_length // 2, 128)
if args.onnx is None:
print("Exporting onnx model. It might take a few minutes...")
start_time = time.time()
ort_model, onnx_path = load_onnx_model(pretrained_model_name, args.onnx, args.provider, args.use_io_binding)
latency = time.time() - start_time
print(f"Onnx model exported or loaded in {latency:.1f} seconds")

print(ort_model.config)
if sequence_length > ort_model.config.max_position_embeddings:
Expand All @@ -222,14 +237,22 @@ def main():
)

task_evaluator = evaluator("question-answering")
print("Loading dataset...")
start_time = time.time()
squad_dataset = load_dataset("squad", split=f"validation[:{args.total}]" if args.total > 0 else "validation")
latency = time.time() - start_time
print(f"Dataset loaded in {latency:.1f} seconds")

print("Evaluating squad_v2 with ORT. It might take a few minutes...")
start_time = time.time()
result = task_evaluator.compute(
model_or_pipeline=qa_pipeline,
data=squad_dataset,
metric="squad_v2",
squad_v2_format=True,
)
latency = time.time() - start_time
print(f"Evaluation done in {latency:.1f} seconds")

result["provider"] = args.provider
result["disable_fused_attention"] = disable_fused_attention
Expand Down

0 comments on commit c695de9

Please sign in to comment.