From 85e6fff70e9d74b2fbd38273636654cf8328d90f Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Wed, 18 Oct 2023 23:51:53 +0200 Subject: [PATCH] [ORT Training] Some important updates of ONNX Runtime training APIs (#1335) * update trainer * update args * update to main * update to 4.33 * fix style * make style * fix when testing * Update optimum/onnxruntime/trainer_seq2seq.py Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> * deprecate ort inf * deprectae ort inf for seq2seq * update trainer and its args to main * try CI permission * update tests * update examples * withdraw CI change --------- Co-authored-by: JingyaHuang Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> --- .../docker/Dockerfile-ort-nightly-cu118 | 3 +- .../docker/Dockerfile-ort1.14.1-cu116 | 2 +- .../docker/Dockerfile-ort1.15.1-cu118 | 2 +- .../docker/Dockerfile-ort1.16.1-cu118 | 2 +- .../run_image_classification.py | 69 +- .../training/language-modeling/run_clm.py | 190 ++-- .../training/language-modeling/run_mlm.py | 194 ++-- .../training/question-answering/run_qa.py | 85 +- .../training/question-answering/trainer_qa.py | 73 +- .../summarization/run_summarization.py | 151 +-- .../onnxruntime/training/test_examples.py | 174 ---- .../training/text-classification/run_glue.py | 97 +- .../training/token-classification/run_ner.py | 78 +- .../training/translation/run_translation.py | 98 +- optimum/onnxruntime/trainer.py | 952 ++---------------- optimum/onnxruntime/trainer_seq2seq.py | 691 ++----------- optimum/onnxruntime/training_args.py | 108 +- .../docker/Dockerfile_onnxruntime_trainer | 7 +- .../training/nightly_test_examples.py | 219 ++++ .../{ => training}/nightly_test_trainer.py | 693 +++++-------- 20 files changed, 1340 insertions(+), 2548 deletions(-) delete mode 100644 examples/onnxruntime/training/test_examples.py create mode 100644 tests/onnxruntime/training/nightly_test_examples.py rename tests/onnxruntime/{ => training}/nightly_test_trainer.py (54%) diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 index 668e5e5669..3e6841453b 100644 --- a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 +++ b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 @@ -22,6 +22,7 @@ CMD nvidia-smi ENV DEBIAN_FRONTEND noninteractive # Versions +# available options 3.8, 3.9, 3.10, 3.11 ARG PYTHON_VERSION=3.9 ARG TORCH_CUDA_VERSION=cu118 ARG TORCH_VERSION=2.0.0 @@ -34,7 +35,7 @@ SHELL ["/bin/bash", "-c"] # Install and update tools to minimize security vulnerabilities RUN apt-get update RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \ - bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \ + bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \ apt-get clean RUN unattended-upgrade RUN apt-get autoremove -y diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 b/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 index db2219b5c6..15df7c352f 100644 --- a/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 +++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 @@ -33,7 +33,7 @@ ARG TORCHVISION_VERSION=0.14.1 # Install and update tools to minimize security vulnerabilities RUN apt-get update RUN 
apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \ - bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \ + bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \ apt-get clean RUN unattended-upgrade RUN apt-get autoremove -y diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 index 51c9ec514c..2d1306e1a3 100644 --- a/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 +++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 @@ -34,7 +34,7 @@ ARG TORCHVISION_VERSION=0.15.1 # Install and update tools to minimize security vulnerabilities RUN apt-get update RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \ - bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \ + bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \ apt-get clean RUN unattended-upgrade RUN apt-get autoremove -y diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118 index 3f6b833592..482d495fcb 100644 --- a/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118 +++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118 @@ -34,7 +34,7 @@ SHELL ["/bin/bash", "-c"] # Install and update tools to minimize security vulnerabilities RUN apt-get update RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \ - bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \ + bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \ apt-get clean RUN unattended-upgrade RUN apt-get autoremove -y diff --git a/examples/onnxruntime/training/image-classification/run_image_classification.py b/examples/onnxruntime/training/image-classification/run_image_classification.py index 837cb57a4b..ec8de0b52d 100644 --- a/examples/onnxruntime/training/image-classification/run_image_classification.py +++ b/examples/onnxruntime/training/image-classification/run_image_classification.py @@ -16,6 +16,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -54,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") @@ -141,12 +142,28 @@ class ModelArguments: metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." 
+ ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -162,32 +179,24 @@ def collate_fn(examples): return {"pixel_values": pixel_values, "labels": labels} -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -200,6 +209,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
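# Illustrative sketch (not taken from the patch): with the `inference_with_ort` flag removed from
# these example scripts, `trainer.evaluate()` and `trainer.predict()` run with the PyTorch backend.
# If ONNX Runtime inference is still wanted after training, one possible path, assuming Optimum's
# ORT model classes and the `export=True` argument are available in the installed version, is:
from transformers import AutoImageProcessor, pipeline

from optimum.onnxruntime import ORTModelForImageClassification

checkpoint_dir = "path/to/output_dir"  # hypothetical directory produced by trainer.save_model()
ort_model = ORTModelForImageClassification.from_pretrained(checkpoint_dir, export=True)
image_processor = AutoImageProcessor.from_pretrained(checkpoint_dir)
classifier = pipeline("image-classification", model=ort_model, image_processor=image_processor)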
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) transformers.utils.logging.set_verbosity(log_level) @@ -209,7 +222,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -238,7 +251,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, task="image-classification", - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -285,7 +298,8 @@ def compute_metrics(p): finetuning_task="image-classification", cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForImageClassification.from_pretrained( model_args.model_name_or_path, @@ -293,14 +307,16 @@ def compute_metrics(p): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) image_processor = AutoImageProcessor.from_pretrained( model_args.image_processor_name or model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # Define torchvision transforms to be applied to each image. @@ -367,7 +383,6 @@ def val_transforms(example_batch): compute_metrics=compute_metrics, tokenizer=image_processor, data_collator=collate_fn, - feature="image-classification", ) # Training @@ -385,7 +400,7 @@ def val_transforms(example_batch): # Evaluation if training_args.do_eval: - metrics = trainer.evaluate(inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate() trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) diff --git a/examples/onnxruntime/training/language-modeling/run_clm.py b/examples/onnxruntime/training/language-modeling/run_clm.py index 2807d3f721..bd9694ae41 100644 --- a/examples/onnxruntime/training/language-modeling/run_clm.py +++ b/examples/onnxruntime/training/language-modeling/run_clm.py @@ -24,12 +24,14 @@ import math import os import sys +import warnings from dataclasses import dataclass, field from itertools import chain from typing import Optional import datasets import evaluate +import torch import transformers from datasets import load_dataset from transformers import ( @@ -52,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -73,7 +75,7 @@ class ModelArguments: default=None, metadata={ "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." 
+ "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." ) }, ) @@ -108,12 +110,47 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." + ) + }, + ) + torch_dtype: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " + "dtype will be automatically derived from the model's weights." + ), + "choices": ["auto", "bfloat16", "float16", "float32"], + }, + ) + low_cpu_mem_usage: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " + "set True will benefit LLM loading time and RAM consumption." ) }, ) @@ -160,7 +197,7 @@ class DataTrainingArguments: ) }, ) - + streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) block_size: Optional[int] = field( default=None, metadata={ @@ -189,6 +226,9 @@ class DataTrainingArguments: ) def __post_init__(self): + if self.streaming: + require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") + if self.dataset_name is None and self.train_file is None and self.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") else: @@ -200,32 +240,24 @@ def __post_init__(self): assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
- model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -238,6 +270,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -248,7 +284,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -285,7 +321,8 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -293,14 +330,16 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) else: data_files = {} @@ -321,7 +360,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
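# Illustrative sketch (not taken from the patch): the `--streaming` flag added to the
# language-modeling examples forwards `streaming=True` to `datasets.load_dataset`, which returns
# an iterable dataset instead of downloading everything up front (requires datasets>=2.0.0):
from datasets import load_dataset

streamed = load_dataset(
    "wikitext",             # hypothetical dataset name
    "wikitext-2-raw-v1",
    split="train",
    streaming=True,         # iterate over examples without materializing the full dataset
)
first_example = next(iter(streamed))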
@@ -331,7 +370,7 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) raw_datasets["train"] = load_dataset( @@ -339,7 +378,7 @@ def main(): data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) @@ -355,7 +394,8 @@ def main(): config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) @@ -373,7 +413,8 @@ def main(): "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) @@ -381,32 +422,44 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You are instantiating a new tokenizer from scratch. This is not supported by this script. " "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: + torch_dtype = ( + model_args.torch_dtype + if model_args.torch_dtype in ["auto", None] + else getattr(torch, model_args.torch_dtype) + ) model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, + torch_dtype=torch_dtype, + low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: - model = AutoModelForCausalLM.from_config(config) + model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. 
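# Illustrative sketch (not taken from the patch): the embedding-resize change above only grows the
# embedding matrix when the tokenizer vocabulary is actually larger than it; a toy illustration of
# the same check:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer.add_tokens(["<extra_token>"])  # hypothetical added token
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:      # resize only when the vocabulary outgrew the embeddings
    model.resize_token_embeddings(len(tokenizer))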
if training_args.do_train: - column_names = raw_datasets["train"].column_names + column_names = list(raw_datasets["train"].features) else: - column_names = raw_datasets["validation"].column_names + column_names = list(raw_datasets["validation"].features) text_column_name = "text" if "text" in column_names else column_names[0] # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function @@ -424,27 +477,34 @@ def tokenize_function(examples): return output with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + ) if data_args.block_size is None: block_size = tokenizer.model_max_length - if block_size > 1024: + if block_size > config.max_position_embeddings: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." + f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx." ) - block_size = 1024 + block_size = min(1024, config.max_position_embeddings) else: if data_args.block_size > tokenizer.model_max_length: logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model " f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(data_args.block_size, tokenizer.model_max_length) @@ -454,10 +514,9 @@ def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] @@ -471,16 +530,22 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) + if not data_args.streaming: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + else: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + ) if training_args.do_train: if "train" not in tokenized_datasets: @@ -528,7 +593,6 @@ def compute_metrics(eval_preds): preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available() else None, - feature="text-generation", ) # Training @@ -556,7 +620,7 @@ def compute_metrics(eval_preds): if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) diff --git a/examples/onnxruntime/training/language-modeling/run_mlm.py b/examples/onnxruntime/training/language-modeling/run_mlm.py index 122395a1cd..3365ca8703 100755 --- a/examples/onnxruntime/training/language-modeling/run_mlm.py +++ b/examples/onnxruntime/training/language-modeling/run_mlm.py @@ -25,6 +25,7 @@ import math import os import sys +import warnings from dataclasses import dataclass, field from itertools import chain from typing import Optional @@ -52,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -71,7 +72,7 @@ class ModelArguments: default=None, metadata={ "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." + "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." ) }, ) @@ -106,12 +107,37 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." 
+ ) + }, + ) + low_cpu_mem_usage: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " + "set True will benefit LLM loading time and RAM consumption." ) }, ) @@ -196,8 +222,12 @@ class DataTrainingArguments: ) }, ) + streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) def __post_init__(self): + if self.streaming: + require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") + if self.dataset_name is None and self.train_file is None and self.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") else: @@ -211,32 +241,24 @@ def __post_init__(self): raise ValueError("`validation_file` should be a csv, a json or a txt file.") -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -249,6 +271,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
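# Illustrative sketch (not taken from the patch): `--trust_remote_code`, added to each example's
# ModelArguments, is forwarded to the `from_pretrained` calls; it should only be enabled for
# repositories whose modeling code has been reviewed, since it executes code fetched from the Hub
# (hypothetical repository name below):
from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer

model_id = "some-org/custom-mlm-with-remote-code"  # hypothetical repo with custom modeling code
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_id, config=config, trust_remote_code=True)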
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -259,7 +285,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): logger.info(f"Training/evaluation parameters {training_args}") @@ -297,7 +323,8 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -305,14 +332,16 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) else: data_files = {} @@ -328,7 +357,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
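# Illustrative sketch (not taken from the patch): throughout these scripts the deprecated
# `use_auth_token=True` is replaced by an explicit `token` argument. For private repositories the
# token can either come from `huggingface-cli login` (pass `token=None`) or be supplied directly,
# assuming datasets/transformers versions that accept `token`; names below are hypothetical:
from datasets import load_dataset
from transformers import AutoTokenizer

hf_token = "hf_xxx"  # hypothetical token; use None to fall back to the cached login token
private_dataset = load_dataset("some-org/private-corpus", split="train", token=hf_token)
tokenizer = AutoTokenizer.from_pretrained("some-org/private-model", token=hf_token)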
@@ -338,14 +367,14 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) raw_datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at @@ -359,7 +388,8 @@ def main(): config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) @@ -377,7 +407,8 @@ def main(): "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) @@ -385,7 +416,7 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You are instantiating a new tokenizer from scratch. This is not supported by this script. " "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) @@ -396,34 +427,41 @@ def main(): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, + low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: logger.info("Training new model from scratch") - model = AutoModelForMaskedLM.from_config(config) + model = AutoModelForMaskedLM.from_config(config, trust_remote_code=model_args.trust_remote_code) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: - column_names = raw_datasets["train"].column_names + column_names = list(raw_datasets["train"].features) else: - column_names = raw_datasets["validation"].column_names + column_names = list(raw_datasets["validation"].features) text_column_name = "text" if "text" in column_names else column_names[0] if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." 
+ "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." ) max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) @@ -448,14 +486,21 @@ def tokenize_function(examples): ) with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset line_by_line", - ) + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=[text_column_name], + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset line_by_line", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=[text_column_name], + ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more @@ -464,14 +509,21 @@ def tokenize_function(examples): return tokenizer(examples[text_column_name], return_special_tokens_mask=True) with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on every text in dataset", - ) + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on every text in dataset", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of # max_seq_length. @@ -479,10 +531,9 @@ def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= max_seq_length: - total_length = (total_length // max_seq_length) * max_seq_length + # We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. 
+ total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] @@ -495,16 +546,22 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {max_seq_length}", - ) + if not data_args.streaming: + tokenized_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {max_seq_length}", + ) + else: + tokenized_datasets = tokenized_datasets.map( + group_texts, + batched=True, + ) if training_args.do_train: if "train" not in tokenized_datasets: @@ -563,7 +620,6 @@ def compute_metrics(eval_preds): preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available() else None, - feature="fill-mask", ) # Training @@ -590,7 +646,7 @@ def compute_metrics(eval_preds): if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) diff --git a/examples/onnxruntime/training/question-answering/run_qa.py b/examples/onnxruntime/training/question-answering/run_qa.py index ec93d45c57..08b581a1a8 100644 --- a/examples/onnxruntime/training/question-answering/run_qa.py +++ b/examples/onnxruntime/training/question-answering/run_qa.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -49,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -79,12 +80,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. 
This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -214,32 +231,24 @@ def __post_init__(self): assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -252,6 +261,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -262,7 +275,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -299,13 +312,14 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] + if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] @@ -317,7 +331,7 @@ def main(): data_files=data_files, field="data", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -331,14 +345,16 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, @@ -346,7 +362,8 @@ def main(): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # Tokenizer check: this script requires a fast tokenizer. @@ -374,7 +391,7 @@ def main(): if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) @@ -599,12 +616,12 @@ def post_processing_function(examples, features, predictions, stage="eval"): # Format the result to the format the metric expects. 
if data_args.version_2_with_negative: formatted_predictions = [ - {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + {"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() ] else: - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()] - references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + references = [{"id": str(ex["id"]), "answers": ex[answer_column_name]} for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad") @@ -623,7 +640,6 @@ def compute_metrics(p: EvalPrediction): data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, - feature="question-answering", ) # Training @@ -649,7 +665,7 @@ def compute_metrics(p: EvalPrediction): # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) @@ -660,10 +676,7 @@ def compute_metrics(p: EvalPrediction): # Prediction if training_args.do_predict: logger.info("*** Predict ***") - results = trainer.predict( - predict_dataset, predict_examples, inference_with_ort=inference_args.inference_with_ort - ) - + results = trainer.predict(predict_dataset, predict_examples) metrics = results.metrics max_predict_samples = ( diff --git a/examples/onnxruntime/training/question-answering/trainer_qa.py b/examples/onnxruntime/training/question-answering/trainer_qa.py index 695ca92927..26ea820ace 100644 --- a/examples/onnxruntime/training/question-answering/trainer_qa.py +++ b/examples/onnxruntime/training/question-answering/trainer_qa.py @@ -15,7 +15,10 @@ """ A subclass of `ORTTrainer` specific to Question-Answering tasks """ -from transformers.trainer_utils import PredictionOutput +import math +import time + +from transformers.trainer_utils import PredictionOutput, speed_metrics from optimum.onnxruntime import ORTTrainer @@ -26,14 +29,7 @@ def __init__(self, *args, eval_examples=None, post_process_function=None, **kwar self.eval_examples = eval_examples self.post_process_function = post_process_function - def evaluate( - self, - eval_dataset=None, - eval_examples=None, - ignore_keys=None, - metric_key_prefix: str = "eval", - inference_with_ort: bool = False, - ): + def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset eval_dataloader = self.get_eval_dataloader(eval_dataset) eval_examples = self.eval_examples if eval_examples is None else eval_examples @@ -41,11 +37,8 @@ def evaluate( # Temporarily disable metric computation, we will do it in the loop here. 
compute_metrics = self.compute_metrics self.compute_metrics = None - if inference_with_ort: - eval_loop = self.prediction_loop_ort if self.args.use_legacy_prediction_loop else self.evaluation_loop_ort - else: - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + start_time = time.time() try: output = eval_loop( eval_dataloader, @@ -54,11 +47,23 @@ def evaluate( # self.args.prediction_loss_only prediction_loss_only=True if compute_metrics is None else None, ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, ) finally: self.compute_metrics = compute_metrics - - if self.post_process_function is not None and self.compute_metrics is not None: + total_batch_size = self.args.eval_batch_size * self.args.world_size + if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) + if self.post_process_function is not None and self.compute_metrics is not None and self.args.should_save: + # Only the main node write the results by default eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) metrics = self.compute_metrics(eval_preds) @@ -66,31 +71,25 @@ def evaluate( for key in list(metrics.keys()): if not key.startswith(f"{metric_key_prefix}_"): metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + metrics.update(output.metrics) + else: + metrics = output.metrics + if self.args.should_log: + # Only the main node log the results by default self.log(metrics) - else: - metrics = {} self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) return metrics - def predict( - self, - predict_dataset, - predict_examples, - ignore_keys=None, - metric_key_prefix: str = "test", - inference_with_ort: bool = False, - ): + def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): predict_dataloader = self.get_test_dataloader(predict_dataset) # Temporarily disable metric computation, we will do it in the loop here. 
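# Illustrative sketch (not taken from the patch): the `speed_metrics` helper from
# `transformers.trainer_utils`, used in the updated evaluate/predict methods, converts the measured
# wall-clock time into `<prefix>_runtime`, `<prefix>_samples_per_second` and
# `<prefix>_steps_per_second` entries, roughly like this (sizes are hypothetical):
import math
import time

from transformers.trainer_utils import speed_metrics

start_time = time.time()
# ... evaluation loop runs here ...
num_samples, eval_batch_size, world_size = 1000, 8, 1
total_batch_size = eval_batch_size * world_size
metrics = speed_metrics(
    "eval",
    start_time,
    num_samples=num_samples,
    num_steps=math.ceil(num_samples / total_batch_size),
)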
compute_metrics = self.compute_metrics self.compute_metrics = None - if inference_with_ort: - eval_loop = self.prediction_loop_ort if self.args.use_legacy_prediction_loop else self.evaluation_loop_ort - else: - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + start_time = time.time() try: output = eval_loop( predict_dataloader, @@ -99,9 +98,21 @@ def predict( # self.args.prediction_loss_only prediction_loss_only=True if compute_metrics is None else None, ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, ) finally: self.compute_metrics = compute_metrics + total_batch_size = self.args.eval_batch_size * self.args.world_size + if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) if self.post_process_function is None or self.compute_metrics is None: return output @@ -113,5 +124,5 @@ def predict( for key in list(metrics.keys()): if not key.startswith(f"{metric_key_prefix}_"): metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - + metrics.update(output.metrics) return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) diff --git a/examples/onnxruntime/training/summarization/run_summarization.py b/examples/onnxruntime/training/summarization/run_summarization.py index d1264489d8..83ec61f225 100644 --- a/examples/onnxruntime/training/summarization/run_summarization.py +++ b/examples/onnxruntime/training/summarization/run_summarization.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -51,7 +52,7 @@ # Might have error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") @@ -98,12 +99,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -187,7 +204,7 @@ class DataTrainingArguments: metadata={ "help": ( "The maximum total sequence length for validation target text after tokenization. 
Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. " "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " "during ``evaluate`` and ``predict``." ) @@ -246,14 +263,14 @@ class DataTrainingArguments: }, ) source_prefix: Optional[str] = field( - default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."} + default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."} ) forced_bos_token: Optional[str] = field( default=None, metadata={ "help": ( - "The token to force as the first generated token after the decoder_start_token_id." + "The token to force as the first generated token after the decoder_start_token_id. " "Useful for multilingual models like mBART where the first generated token" "needs to be the target language token (Usually it is the target language token)" ) @@ -261,8 +278,13 @@ class DataTrainingArguments: ) def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + and self.test_file is None + ): + raise ValueError("Need either a dataset name or a training, validation, or test file.") else: if self.train_file is not None: extension = self.train_file.split(".")[-1] @@ -270,22 +292,13 @@ def __post_init__(self): if self.validation_file is not None: extension = self.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." if self.val_max_target_length is None: self.val_max_target_length = self.max_target_length -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - summarization_name_mapping = { "amazon_reviews_multi": ("review_body", "review_title"), "big_patent": ("description", "abstract"), @@ -307,15 +320,19 @@ def main(): # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTSeq2SeqTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTSeq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
- model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -327,6 +344,11 @@ def main(): datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) + + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -337,7 +359,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -386,7 +408,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -403,7 +425,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
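Note: the `use_auth_token` handling added in the hunk above is the same back-compatibility shim applied across all of the updated example scripts: prefer the new `token` argument, warn on the deprecated flag, and refuse ambiguous combinations. A minimal standalone sketch of that logic (the `resolve_token` helper is illustrative only and not part of the patch):

import warnings


def resolve_token(token, use_auth_token):
    """Back-compat shim mirroring the hunk above: prefer `token` over the deprecated `use_auth_token`."""
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning
        )
        if token is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        token = use_auth_token
    return token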
@@ -417,14 +439,16 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, @@ -432,11 +456,10 @@ def main(): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - model.resize_token_embeddings(len(tokenizer)) - if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.lang] @@ -471,10 +494,16 @@ def main(): # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") column_names = raw_datasets["train"].column_names elif training_args.do_eval: + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") column_names = raw_datasets["validation"].column_names elif training_args.do_predict: + if "test" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") column_names = raw_datasets["test"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") @@ -520,7 +549,7 @@ def main(): if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " f"`{model.__class__.__name__}`. 
This will lead to loss being calculated twice and will take up more memory" ) @@ -550,8 +579,6 @@ def preprocess_function(examples): return model_inputs if training_args.do_train: - if "train" not in raw_datasets: - raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) @@ -568,8 +595,6 @@ def preprocess_function(examples): if training_args.do_eval: max_target_length = data_args.val_max_target_length - if "validation" not in raw_datasets: - raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) @@ -586,8 +611,6 @@ def preprocess_function(examples): if training_args.do_predict: max_target_length = data_args.val_max_target_length - if "test" not in raw_datasets: - raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) @@ -628,10 +651,10 @@ def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] + # Replace -100s used for padding as we can't decode them + preds = np.where(preds != -100, preds, tokenizer.pad_token_id) decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - if data_args.ignore_pad_token_for_loss: - # Replace -100 in the labels as we can't decode them. - labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing @@ -643,6 +666,16 @@ def compute_metrics(eval_preds): result["gen_len"] = np.mean(prediction_lens) return result + # Override the decoding parameters of Seq2SeqTrainer + training_args.generation_max_length = ( + training_args.generation_max_length + if training_args.generation_max_length is not None + else data_args.val_max_target_length + ) + training_args.generation_num_beams = ( + data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams + ) + # Initialize our Trainer trainer = ORTSeq2SeqTrainer( model=model, @@ -652,7 +685,6 @@ def compute_metrics(eval_preds): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, - feature="text2text-generation", ) # Training @@ -677,20 +709,15 @@ def compute_metrics(eval_preds): # Evaluation results = {} - max_length = ( - training_args.generation_max_length - if training_args.generation_max_length is not None - else data_args.val_max_target_length - ) - num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate( - max_length=max_length, - num_beams=num_beams, - metric_key_prefix="eval", - inference_with_ort=inference_args.inference_with_ort, - ) + if isinstance(eval_dataset, dict): + metrics = {} + for eval_ds_name, eval_ds in eval_dataset.items(): + dataset_metrics = trainer.evaluate(eval_dataset=eval_ds, metric_key_prefix=f"eval_{eval_ds_name}") + metrics.update(dataset_metrics) + else: + metrics = trainer.evaluate(metric_key_prefix="eval") max_eval_samples = 
data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) @@ -700,13 +727,7 @@ def compute_metrics(eval_preds): if training_args.do_predict: logger.info("*** Predict ***") - predict_results = trainer.predict( - predict_dataset, - metric_key_prefix="predict", - max_length=max_length, - num_beams=num_beams, - inference_with_ort=inference_args.inference_with_ort, - ) + predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict") metrics = predict_results.metrics max_predict_samples = ( data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) @@ -718,8 +739,10 @@ def compute_metrics(eval_preds): if trainer.is_world_process_zero(): if training_args.predict_with_generate: + predictions = predict_results.predictions + predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id) predictions = tokenizer.batch_decode( - predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True + predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True ) predictions = [pred.strip() for pred in predictions] output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt") diff --git a/examples/onnxruntime/training/test_examples.py b/examples/onnxruntime/training/test_examples.py deleted file mode 100644 index 8fe1de53d5..0000000000 --- a/examples/onnxruntime/training/test_examples.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import logging -import os -import sys -import unittest -from unittest.mock import patch - -import torch -from transformers.file_utils import is_apex_available -from transformers.testing_utils import TestCasePlus, get_gpu_count, slow, torch_device - - -SRC_DIRS = [ - os.path.join(os.path.dirname(__file__), dirname) - for dirname in [ - "text-classification", - "token-classification", - "question-answering", - "translation", - ] -] -sys.path.extend(SRC_DIRS) -if SRC_DIRS is not None: - import run_glue - import run_ner - import run_qa - import run_translation - - -logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger() - - -def get_results(output_dir): - results = {} - path = os.path.join(output_dir, "all_results.json") - if os.path.exists(path): - with open(path, "r") as f: - results = json.load(f) - else: - raise ValueError(f"can't find {path}") - return results - - -def is_cuda_and_apex_available(): - is_using_cuda = torch.cuda.is_available() and torch_device == "cuda" - return is_using_cuda and is_apex_available() - - -class ExamplesTests(TestCasePlus): - # Text Classification Tests - def test_run_glue(self): - stream_handler = logging.StreamHandler(sys.stdout) - logger.addHandler(stream_handler) - - tmp_dir = self.get_auto_remove_tmp_dir() - testargs = f""" - run_glue.py - --model_name_or_path bert-base-uncased - --task_name sst2 - --do_train - --do_eval - --output_dir {tmp_dir} - --overwrite_output_dir - --learning_rate=1e-5 - --per_device_train_batch_size=16 - --per_device_eval_batch_size=16 - """.split() - - with patch.object(sys, "argv", testargs): - run_glue.main() - result = get_results(tmp_dir) - self.assertGreaterEqual(result["eval_accuracy"], 0.75) - - # Token Classification Tests - def test_run_ner(self): - stream_handler = logging.StreamHandler(sys.stdout) - logger.addHandler(stream_handler) - - # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu - epochs = 7 if get_gpu_count() > 1 else 2 - - tmp_dir = self.get_auto_remove_tmp_dir() - testargs = f""" - run_ner.py - --model_name_or_path bert-base-uncased - --dataset_name conll2003 - --do_train - --do_eval - --output_dir {tmp_dir} - --overwrite_output_dir - --learning_rate=1e-5 - --per_device_train_batch_size=16 - --per_device_eval_batch_size=16 - --num_train_epochs={epochs} - """.split() - - with patch.object(sys, "argv", testargs): - run_ner.main() - result = get_results(tmp_dir) - self.assertGreaterEqual(result["eval_accuracy"], 0.75) - self.assertLess(result["eval_loss"], 0.5) - - # Question Answering Tests - def test_run_qa(self): - stream_handler = logging.StreamHandler(sys.stdout) - logger.addHandler(stream_handler) - - tmp_dir = self.get_auto_remove_tmp_dir() - testargs = f""" - run_qa.py - --model_name_or_path bert-base-uncased - --dataset_name squad - --do_train - --do_eval - --output_dir {tmp_dir} - --overwrite_output_dir - --learning_rate=1e-5 - --per_device_train_batch_size=16 - --per_device_eval_batch_size=16 - """.split() - - with patch.object(sys, "argv", testargs): - run_qa.main() - result = get_results(tmp_dir) - self.assertGreaterEqual(result["eval_f1"], 30) - self.assertGreaterEqual(result["eval_exact"], 30) - - @slow - def test_run_translation(self): - stream_handler = logging.StreamHandler(sys.stdout) - logger.addHandler(stream_handler) - - tmp_dir = self.get_auto_remove_tmp_dir() - testargs = f""" - run_translation.py - --model_name_or_path t5-large - --source_lang en - --target_lang ro - --dataset_name wmt16 - --output_dir 
{tmp_dir} - --overwrite_output_dir - --max_steps=50 - --warmup_steps=8 - --do_train - --learning_rate=3e-3 - --per_device_train_batch_size=2 - --per_device_eval_batch_size=1 - --predict_with_generate - """.split() - - with patch.object(sys, "argv", testargs): - run_translation.main() - result = get_results(tmp_dir) - self.assertGreaterEqual(result["eval_bleu"], 30) - - -if __name__ == "__main__": - unittest.main() diff --git a/examples/onnxruntime/training/text-classification/run_glue.py b/examples/onnxruntime/training/text-classification/run_glue.py index 7a81a2ff15..f3f04657af 100644 --- a/examples/onnxruntime/training/text-classification/run_glue.py +++ b/examples/onnxruntime/training/text-classification/run_glue.py @@ -21,6 +21,7 @@ import os import random import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -48,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -188,12 +189,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -203,32 +220,24 @@ class ModelArguments: ) -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
- model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -241,6 +250,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -291,7 +304,7 @@ def main(): "glue", data_args.task_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) elif data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. @@ -299,7 +312,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: # Loading a dataset from your local files. @@ -328,7 +341,7 @@ def main(): "csv", data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: # Loading a dataset from local json files @@ -336,7 +349,7 @@ def main(): "json", data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. 
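Note: the dataset-loading calls above now forward Hub credentials through the `token` keyword of `datasets.load_dataset` instead of `use_auth_token`. A minimal sketch of the updated call shape (the GLUE/SST-2 names are only an example; `token=None` falls back to the credential stored by `huggingface-cli login`):

from datasets import load_dataset

raw_datasets = load_dataset(
    "glue",
    "sst2",
    cache_dir=None,  # model_args.cache_dir in the script
    token=None,      # model_args.token in the script
)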
@@ -371,14 +384,16 @@ def main(): finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, @@ -386,7 +401,8 @@ def main(): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) @@ -440,7 +456,7 @@ def main(): if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) @@ -496,6 +512,8 @@ def preprocess_function(examples): # Get the metric function if data_args.task_name is not None: metric = evaluate.load("glue", data_args.task_name) + elif is_regression: + metric = evaluate.load("mse") else: metric = evaluate.load("accuracy") @@ -504,17 +522,12 @@ def preprocess_function(examples): def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) - if data_args.task_name is not None: - result = metric.compute(predictions=preds, references=p.label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - elif is_regression: - return {"mse": ((preds - p.label_ids) ** 2).mean().item()} - else: - return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result - # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to ORTTrainer, so we change it if + # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if # we already did the padding. 
if data_args.pad_to_max_length: data_collator = default_data_collator @@ -532,7 +545,6 @@ def compute_metrics(p: EvalPrediction): compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, - feature="text-classification", ) # Training @@ -550,6 +562,7 @@ def compute_metrics(p: EvalPrediction): metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.save_model() # Saves the tokenizer too for easy upload + trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() @@ -571,7 +584,7 @@ def compute_metrics(p: EvalPrediction): combined = {} for eval_dataset, task in zip(eval_datasets, tasks): - metrics = trainer.evaluate(eval_dataset=eval_dataset, inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate(eval_dataset=eval_dataset) max_eval_samples = ( data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) @@ -599,9 +612,7 @@ def compute_metrics(p: EvalPrediction): for predict_dataset, task in zip(predict_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. predict_dataset = predict_dataset.remove_columns("label") - predictions = trainer.predict( - predict_dataset, metric_key_prefix="predict", inference_with_ort=inference_args.inference_with_ort - ).predictions + predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") diff --git a/examples/onnxruntime/training/token-classification/run_ner.py b/examples/onnxruntime/training/token-classification/run_ner.py index 80366f07ad..55ddfa2cf0 100644 --- a/examples/onnxruntime/training/token-classification/run_ner.py +++ b/examples/onnxruntime/training/token-classification/run_ner.py @@ -22,6 +22,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -49,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") @@ -79,12 +80,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." 
) }, ) @@ -204,32 +221,24 @@ def __post_init__(self): self.task_name = self.task_name.lower() -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -242,6 +251,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -252,7 +265,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -289,7 +302,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -358,7 +371,8 @@ def get_label_list(labels): finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path @@ -368,7 +382,8 @@ def get_label_list(labels): cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, add_prefix_space=True, ) else: @@ -377,7 +392,8 @@ def get_label_list(labels): cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForTokenClassification.from_pretrained( @@ -386,7 +402,8 @@ def get_label_list(labels): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) @@ -567,7 +584,6 @@ def compute_metrics(p): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, - feature="token-classification", ) # Training @@ -580,6 +596,7 @@ def compute_metrics(p): train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload + max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) @@ -593,7 +610,7 @@ def compute_metrics(p): if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) @@ -605,10 +622,7 @@ def compute_metrics(p): if training_args.do_predict: logger.info("*** Predict ***") - predictions, labels, metrics = trainer.predict( - predict_dataset, metric_key_prefix="predict", inference_with_ort=inference_args.inference_with_ort - ) - + predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict") predictions = np.argmax(predictions, 
axis=2) # Remove ignored index (special tokens) diff --git a/examples/onnxruntime/training/translation/run_translation.py b/examples/onnxruntime/training/translation/run_translation.py index e410454f2f..0b6a36d12f 100644 --- a/examples/onnxruntime/training/translation/run_translation.py +++ b/examples/onnxruntime/training/translation/run_translation.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -52,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") @@ -89,12 +90,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -156,7 +173,7 @@ class DataTrainingArguments: metadata={ "help": ( "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. " "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " "during ``evaluate`` and ``predict``." ) @@ -248,32 +265,24 @@ def __post_init__(self): self.val_max_target_length = self.max_target_length -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTSeq2SeqTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTSeq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
- model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -286,6 +295,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -296,7 +309,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -345,7 +358,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -362,7 +375,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
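Note: the logging hunks above follow the current Transformers example convention: `basicConfig` on every process, INFO verbosity only where `training_args.should_log` is true, and the per-process log level propagated to `datasets` and `transformers`. A short sketch of that setup as a helper (the `setup_logging` function is illustrative; in the scripts this code lives inline in `main()`):

import logging
import sys

import datasets
import transformers


def setup_logging(logger, training_args):
    # Configure per-process logging the way the updated example scripts do.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    if training_args.should_log:
        # The default of training_args.log_level is passive, so set INFO here to keep that default.
        transformers.utils.logging.set_verbosity_info()
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)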
@@ -376,14 +389,16 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, @@ -391,11 +406,10 @@ def main(): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - model.resize_token_embeddings(len(tokenizer)) - # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): if isinstance(tokenizer, MBartTokenizer): @@ -448,7 +462,7 @@ def main(): if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) @@ -549,10 +563,10 @@ def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] + # Replace -100s used for padding as we can't decode them + preds = np.where(preds != -100, preds, tokenizer.pad_token_id) decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - if data_args.ignore_pad_token_for_loss: - # Replace -100 in the labels as we can't decode them. 
- labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing @@ -575,7 +589,6 @@ def compute_metrics(eval_preds): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, - feature="text2text-generation", ) # Training @@ -609,12 +622,7 @@ def compute_metrics(eval_preds): if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate( - max_length=max_length, - num_beams=num_beams, - metric_key_prefix="eval", - inference_with_ort=inference_args.inference_with_ort, - ) + metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) @@ -625,11 +633,7 @@ def compute_metrics(eval_preds): logger.info("*** Predict ***") predict_results = trainer.predict( - predict_dataset, - metric_key_prefix="predict", - max_length=max_length, - num_beams=num_beams, - inference_with_ort=inference_args.inference_with_ort, + predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams ) metrics = predict_results.metrics max_predict_samples = ( @@ -642,8 +646,10 @@ def compute_metrics(eval_preds): if trainer.is_world_process_zero(): if training_args.predict_with_generate: + predictions = predict_results.predictions + predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id) predictions = tokenizer.batch_decode( - predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True + predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True ) predictions = [pred.strip() for pred in predictions] output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt") diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 89363b6b26..afc90e405b 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -22,8 +22,7 @@ import time import types import warnings -from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union # Integrations must be imported before ML frameworks: @@ -49,43 +48,27 @@ # isort: on -import numpy as np +import huggingface_hub.utils as hf_hub_utils import torch import torch.distributed as dist from torch import nn -from torch.utils.data import DataLoader, Dataset +from torch.utils.data import Dataset, RandomSampler from transformers.data.data_collator import DataCollator from transformers.debug_utils import DebugOption, DebugUnderflowOverflow from transformers.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_zero3_enabled -from transformers.file_utils import ( - is_apex_available, - is_sagemaker_dp_enabled, - is_sagemaker_mp_enabled, - is_torch_tpu_available, -) from transformers.modeling_utils import PreTrainedModel, unwrap_model from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import Trainer from transformers.trainer_callback import TrainerCallback, TrainerState from transformers.trainer_pt_utils import ( - DistributedTensorGatherer, - 
IterableDatasetShard, - SequentialDistributedSampler, - find_batch_size, get_model_param_count, get_module_class_from_name, get_parameter_names, - nested_concat, - nested_detach, - nested_numpify, ) from transformers.trainer_utils import ( - EvalLoopOutput, EvalPrediction, HPSearchBackend, - PredictionOutput, TrainOutput, - denumpify_detensorize, enable_full_determinism, find_executable_batch_size, get_last_checkpoint, @@ -94,31 +77,17 @@ speed_metrics, ) from transformers.training_args import ParallelMode +from transformers.utils import ( + is_apex_available, + is_sagemaker_dp_enabled, + is_sagemaker_mp_enabled, + is_torch_tpu_available, +) -from ..exporters import TasksManager -from ..exporters.onnx import OnnxConfigWithPast, export, export_models, get_decoder_models_for_export from ..utils import logging -from .modeling_decoder import ORTModelForCausalLM -from .modeling_ort import ( - ORTModel, - ORTModelForCustomTasks, - ORTModelForFeatureExtraction, - ORTModelForImageClassification, - ORTModelForMaskedLM, - ORTModelForMultipleChoice, - ORTModelForQuestionAnswering, - ORTModelForSemanticSegmentation, - ORTModelForSequenceClassification, - ORTModelForTokenClassification, -) -from .modeling_seq2seq import ORTModelForSeq2SeqLM, ORTModelForSpeechSeq2Seq from .training_args import ORTOptimizerNames, ORTTrainingArguments from .utils import ( - ONNX_DECODER_NAME, - ONNX_DECODER_WITH_PAST_NAME, - ONNX_WEIGHTS_NAME, is_onnxruntime_training_available, - wrap_onnx_config_for_loss, ) @@ -132,14 +101,10 @@ import optuna -logger = logging.get_logger(__name__) - # Name of the files used for checkpointing -TRAINING_ARGS_NAME = "training_args.bin" TRAINER_STATE_NAME = "trainer_state.json" -OPTIMIZER_NAME = "optimizer.pt" -SCHEDULER_NAME = "scheduler.pt" -SCALER_NAME = "scaler.pt" + +logger = logging.get_logger(__name__) class ModuleWithLoss(nn.Module): @@ -168,44 +133,6 @@ def config(self): return self._original_model.config -class ORTFeaturesManager: - _TASKS_TO_ORTMODELS = { - "feature-extraction": ORTModelForFeatureExtraction, - "fill-mask": ORTModelForMaskedLM, - "text-generation": ORTModelForCausalLM, - "text-generation-with-past": ORTModelForCausalLM, - "text2text-generation": ORTModelForSeq2SeqLM, - "text2text-generation-with-past": ORTModelForSeq2SeqLM, - "text-classification": ORTModelForSequenceClassification, - "token-classification": ORTModelForTokenClassification, - "multiple-choice": ORTModelForMultipleChoice, - "question-answering": ORTModelForQuestionAnswering, - "image-classification": ORTModelForImageClassification, - "semantic-segmentation": ORTModelForSemanticSegmentation, - "automatic-speech-recognition": ORTModelForSpeechSeq2Seq, - } - - SUPPORTED_FEATURES = _TASKS_TO_ORTMODELS.keys() - - @staticmethod - def get_model_class_for_feature(feature: str) -> Type: - """ - Gets the subclass of `ORTModel` associated with the feature. - """ - - return ORTFeaturesManager._TASKS_TO_ORTMODELS[feature] - - @staticmethod - def do_use_cache(feature: str) -> bool: - """ - Gets the value of `use_cache` for the feature. - """ - if "-with-past" in feature: - return True - else: - return False - - class ORTTrainer(Trainer): """ ORTTrainer is a simple but feature-complete training and eval loop for ONNX Runtime, optimized for 🤗 Transformers. 
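Note: with the ORT-specific inference plumbing (ORTFeaturesManager, the ORTModel imports, and the `feature=` argument in the example scripts) removed above, `ORTTrainer` is now constructed like a vanilla `transformers.Trainer`, with only the training arguments class differing. A hedged sketch of the updated instantiation (the model, dataset, tokenizer, collator, and metrics variables are placeholders, not part of the patch):

from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments

training_args = ORTTrainingArguments(output_dir="ort_output")
trainer = ORTTrainer(
    model=model,                      # a PreTrainedModel or torch.nn.Module (placeholder)
    args=training_args,
    train_dataset=train_dataset,      # tokenized datasets (placeholders)
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # no `feature=...` argument any more
)
trainer.train()
metrics = trainer.evaluate()          # runs the standard Transformers evaluation loop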
@@ -286,18 +213,16 @@ class ORTTrainer(Trainer): def __init__( self, model: Union[PreTrainedModel, nn.Module] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - feature: str = "feature-extraction", args: ORTTrainingArguments = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, - onnx_model_path: Union[str, os.PathLike] = None, ): super().__init__( model=model, @@ -323,9 +248,6 @@ def __init__( self.model = model - self.feature = feature - self.onnx_model_path = onnx_model_path - self.exported_with_loss = False if self.args.local_rank: torch.cuda.set_device(self.args.local_rank) @@ -437,7 +359,12 @@ def train( if resume_from_checkpoint is None: raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})") - if resume_from_checkpoint is not None and not is_sagemaker_mp_enabled() and args.deepspeed is None: + if ( + resume_from_checkpoint is not None + and not is_sagemaker_mp_enabled() + and not self.is_deepspeed_enabled + and not self.is_fsdp_enabled + ): self._load_from_checkpoint(resume_from_checkpoint) # If model was re-initialized, put it on the right device and update self.model_wrapped @@ -449,12 +376,25 @@ def train( inner_training_loop = find_executable_batch_size( self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size ) - return inner_training_loop( - args=args, - resume_from_checkpoint=resume_from_checkpoint, - trial=trial, - ignore_keys_for_eval=ignore_keys_for_eval, - ) + if args.push_to_hub: + try: + # Disable progress bars when uploading models during checkpoints to avoid polluting stdout + hf_hub_utils.disable_progress_bars() + return inner_training_loop( + args=args, + resume_from_checkpoint=resume_from_checkpoint, + trial=trial, + ignore_keys_for_eval=ignore_keys_for_eval, + ) + finally: + hf_hub_utils.enable_progress_bars() + else: + return inner_training_loop( + args=args, + resume_from_checkpoint=resume_from_checkpoint, + trial=trial, + ignore_keys_for_eval=ignore_keys_for_eval, + ) def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None @@ -504,14 +444,6 @@ def _inner_training_loop( f" {args.max_steps}" ) - # Compute absolute values for logging, eval, and save if given as ratio - if args.logging_steps and args.logging_steps < 1: - args.logging_steps = math.ceil(max_steps * args.logging_steps) - if args.eval_steps and args.eval_steps < 1: - args.eval_steps = math.ceil(max_steps * args.eval_steps) - if args.save_steps and args.save_steps < 1: - args.save_steps = math.ceil(max_steps * args.save_steps) - if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: if self.args.n_gpu > 1: # nn.DataParallel(model) replicates the model, creating new variables and module @@ -556,13 +488,30 @@ def _inner_training_loop( self.state = TrainerState() self.state.is_hyper_param_search = trial is not None + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps 
< 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + # Activate gradient checkpointing if needed if args.gradient_checkpointing: self.model.gradient_checkpointing_enable() model = self._wrap_model(self.model_wrapped) # Wrap unless the ORTModule is already wrapped, eg. wrap DDP - if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: + if (is_sagemaker_mp_enabled() or self.is_fsdp_enabled) and resume_from_checkpoint is not None: self._load_from_checkpoint(resume_from_checkpoint, model) # as the model is wrapped, don't use `accelerator.prepare` @@ -688,11 +637,27 @@ def _inner_training_loop( self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + # Temp: remove after transformers 4.34 release + def get_dataloader_sampler(dataloader): + if hasattr(dataloader, "batch_sampler") and dataloader.batch_sampler is not None: + return get_dataloader_sampler(dataloader.batch_sampler) + elif hasattr(dataloader, "sampler"): + return dataloader.sampler + # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not args.ignore_data_skip: for epoch in range(epochs_trained): - for _ in train_dataloader: - break + sampler = get_dataloader_sampler(train_dataloader) + is_random_sampler = isinstance(sampler, RandomSampler) + if not is_random_sampler: + # We just need to begin an iteration to create the randomization of the sampler. + for _ in train_dataloader: + break + else: + # Otherwise we need to call the whooooole sampler cause there is some random operation added + # AT THE VERY END! + sampler = sampler if sampler is not None else [] + _ = list(sampler) total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): @@ -703,7 +668,7 @@ def _inner_training_loop( self._past = None steps_in_epoch = ( - len(train_dataloader) + len(epoch_iterator) if len_dataloader is not None else args.max_steps * args.gradient_accumulation_steps ) @@ -715,13 +680,13 @@ def _inner_training_loop( rng_to_sync = False steps_skipped = 0 if steps_trained_in_current_epoch > 0: - skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) steps_skipped = steps_trained_in_current_epoch steps_trained_in_current_epoch = 0 rng_to_sync = True step = -1 - for step, inputs in enumerate(train_dataloader): + for step, inputs in enumerate(epoch_iterator): total_batched_samples += 1 if rng_to_sync: self._load_rng_state(resume_from_checkpoint) @@ -864,759 +829,16 @@ def _inner_training_loop( # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. 
if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: for checkpoint in checkpoints_sorted: - if checkpoint != self.state.best_model_checkpoint: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") shutil.rmtree(checkpoint) self.control = self.callback_handler.on_train_end(args, self.state, self.control) - return TrainOutput(self.state.global_step, train_loss, metrics) - - def evaluate( - self, - eval_dataset: Optional[Dataset] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - inference_with_ort: bool = False, - ) -> Dict[str, float]: - """ - Run evaluation with ONNX Runtime or PyTorch backend and returns metrics. - - Args: - eval_dataset (`Dataset`, *optional*): - Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns - not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` - method. - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is "eval" (default) - - Returns: - A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The - dictionary also contains the epoch number which comes from the training state. - """ - # memory metrics - must set up as early as possible - # TODO: We need to enable evaluation using ORT backend. - if self.args.use_module_with_loss: - self.model = self.model._original_model - self._memory_tracker.start() - - eval_dataloader = self.get_eval_dataloader(eval_dataset) - start_time = time.time() - - if inference_with_ort: - logger.info("[INFO] Evaluating with ONNX Runtime backend.") - eval_loop = self.prediction_loop_ort if self.args.use_legacy_prediction_loop else self.evaluation_loop_ort - else: - logger.info( - "[INFO] Evaluating with PyTorch backend. If you want to use ONNX Runtime for the evaluation, set `trainer.evaluate(inference_with_ort=True)`." - ) - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - - try: - output = eval_loop( - eval_dataloader, - description="Evaluation", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=True if self.compute_metrics is None else None, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - ) - except Exception as error: - logger.error(error) - if inference_with_ort: - logger.error( - f"[ERROR!] Evaluation with ONNX Runtime is not available for {self.model.config.name_or_path} model. Set `inference_with_ort=False` to evaluate with PyTorch." 
- ) - raise - - total_batch_size = self.args.eval_batch_size * self.args.world_size - if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: - start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] - output.metrics.update( - speed_metrics( - metric_key_prefix, - start_time, - num_samples=output.num_samples, - num_steps=math.ceil(output.num_samples / total_batch_size), - ) - ) - - self.log(output.metrics) - - self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) - - self._memory_tracker.stop_and_update_metrics(output.metrics) - - return output.metrics - - def predict( - self, - test_dataset: Dataset, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "test", - inference_with_ort: bool = False, - ) -> PredictionOutput: - """ - Run prediction and returns predictions and potential metrics. - - Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method - will also return metrics, like in `evaluate()`. - - Args: - test_dataset (`Dataset`): - Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the - `model.forward()` method are automatically removed. Has to implement the method `__len__` - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"test"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "test_bleu" if the prefix is "test" (default) - - - - If your predictions or labels have different sequence length (for instance because you're doing dynamic padding - in a token classification task) the predictions will be padded (on the right) to allow for concatenation into - one array. The padding index is -100. - - - - Returns: *NamedTuple* A namedtuple with the following keys: - - - predictions (`np.ndarray`): The predictions on `test_dataset`. - - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). - - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained - labels). - """ - # TODO: We need to enable evaluation using ORT backend. - if self.args.use_module_with_loss: - self.model = self.model._original_model - - # memory metrics - must set up as early as possible - self._memory_tracker.start() - - test_dataloader = self.get_test_dataloader(test_dataset) - start_time = time.time() - - if inference_with_ort: - logger.info("[INFO] Predicting with ONNX Runtime backend.") - eval_loop = self.prediction_loop_ort if self.args.use_legacy_prediction_loop else self.evaluation_loop_ort - else: - logger.info( - "[INFO] Predicting with PyTorch backend. If you want to use ONNX Runtime for the prediction, set `trainer.predict(inference_with_ort=True)`." - ) - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - - try: - output = eval_loop( - test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix - ) - except Exception as error: - logger.error(error) - if inference_with_ort: - logger.error( - f"[ERROR!] Prediction with ONNX Runtime is not available for {self.model.config.name_or_path} model. Set `inference_with_ort=False` to predict with PyTorch." 
- ) - raise - - total_batch_size = self.args.eval_batch_size * self.args.world_size - if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: - start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] - output.metrics.update( - speed_metrics( - metric_key_prefix, - start_time, - num_samples=output.num_samples, - num_steps=math.ceil(output.num_samples / total_batch_size), - ) - ) - - self._memory_tracker.stop_and_update_metrics(output.metrics) - - return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics) - - def evaluation_loop_ort( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> EvalLoopOutput: - """ - Prediction/evaluation loop, shared by `ORTTrainer.evaluate()` and `ORTTrainer.predict()`. - - Works both with or without labels. - """ - logger.info("[INFO] ONNX Runtime inference starts...") - - # Check if there are labels in the dataset - dummy_inputs = next(iter(dataloader)) - has_labels = all(dummy_inputs.get(k) is not None for k in self.label_names) - use_cache = ORTFeaturesManager.do_use_cache(self.feature) - - if self.onnx_model_path and (has_labels == self.exported_with_loss): - logger.info("[INFO] Inference with given ONNX model") - self.onnx_model_path = Path(self.onnx_model_path).as_posix() - else: - onnx_model_path = Path(self.args.output_dir) - - logger.info("[INFO] Exporting the model to ONNX...") - if self.args.deepspeed and self.args.fp16: - export_device = "cuda" - else: - export_device = "cpu" - - # With `label_smoother` the loss will be computed outside modeling - with_loss = has_labels and not self.label_smoother - self._export(onnx_model_path, with_loss=with_loss, device=export_device, use_cache=use_cache) - - self.exported_with_loss = with_loss - self.onnx_model_path = onnx_model_path.as_posix() - logger.info(f"[INFO] ONNX model is stored in: {self.onnx_model_path}") - - # Load ORT model - support_loss_in_modeling = self.feature in [ - "text-generation", - "text-generation-with-past", - "text2text-generation", - "text2text-generation-with-past", - ] - support_feature = self.feature in ORTFeaturesManager.SUPPORTED_FEATURES - if support_loss_in_modeling or (not self.exported_with_loss and support_feature): - # Exported with standard outputs, use specific ORTModels - ort_model_cls = ORTFeaturesManager.get_model_class_for_feature(self.feature) - else: - ort_model_cls = ORTModelForCustomTasks - - model_id = self.onnx_model_path - args = self.args - if ort_model_cls is ORTModelForCausalLM: - ort_model = ort_model_cls.from_pretrained(model_id=model_id, use_cache=use_cache).to(args.device) - else: - ort_model = ort_model_cls.from_pretrained(model_id=model_id).to(args.device) - - prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only - - batch_size = dataloader.batch_size - - logger.info(f"***** Running {description} *****") - if has_length(dataloader): - logger.info(f" Num examples = {self.num_examples(dataloader)}") - else: - logger.info(" Num examples: Unknown") - logger.info(f" Batch size = {batch_size}") - - self.callback_handler.eval_dataloader = dataloader - # Do this before wrapping. 
- eval_dataset = getattr(dataloader, "dataset", None) - - if args.past_index >= 0: - self._past = None - - # Initialize containers - # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) - losses_host = None - preds_host = None - labels_host = None - inputs_host = None - - # losses/preds/labels on CPU (final containers) - all_losses = None - all_preds = None - all_labels = None - all_inputs = None - # Will be useful when we have an iterable dataset so don't know its length. - - observed_num_examples = 0 - # Main evaluation loop - for step, inputs in enumerate(dataloader): - # Update the observed num examples - observed_batch_size = find_batch_size(inputs) - if observed_batch_size is not None: - observed_num_examples += observed_batch_size - # For batch samplers, batch_size is not known by the dataloader in advance. - if batch_size is None: - batch_size = observed_batch_size - - # Prediction step(send also onnxruntime inference session) - loss, logits, labels = self.prediction_step_ort( - ort_model, inputs, prediction_loss_only, ignore_keys=ignore_keys - ) - inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None - - # Update containers on host - if loss is not None: - losses = self.accelerator.gather_for_metrics((loss.repeat(batch_size))) - losses_host = losses if losses_host is None else nested_concat(losses_host, losses, padding_index=-100) - if labels is not None: - labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) - if inputs_decode is not None: - inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100) - inputs_decode = self.accelerator.gather_for_metrics((inputs_decode)) - inputs_host = ( - inputs_decode - if inputs_host is None - else nested_concat(inputs_host, inputs_decode, padding_index=-100) - ) - if logits is not None: - logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100) - if self.preprocess_logits_for_metrics is not None: - logits = self.preprocess_logits_for_metrics(logits, labels) - logits = self.accelerator.gather_for_metrics((logits)) - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) - - if labels is not None: - labels = self.accelerator.gather_for_metrics((labels)) - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. 
- if args.eval_accumulation_steps is not None and self.accelerator.sync_gradients: - if losses_host is not None: - losses = nested_numpify(losses_host) - all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) - if preds_host is not None: - logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) - if inputs_host is not None: - inputs_decode = nested_numpify(inputs_host) - all_inputs = ( - inputs_decode - if all_inputs is None - else nested_concat(all_inputs, inputs_decode, padding_index=-100) - ) - if labels_host is not None: - labels = nested_numpify(labels_host) - all_labels = ( - labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) - ) - - # Set back to None to begin a new accumulation - losses_host, preds_host, inputs_host, labels_host = None, None, None, None - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - if losses_host is not None: - losses = nested_numpify(losses_host) - all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) - if preds_host is not None: - logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) - if inputs_host is not None: - inputs_decode = nested_numpify(inputs_host) - all_inputs = ( - inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) - ) - if labels_host is not None: - labels = nested_numpify(labels_host) - all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) - - # Number of samples - if has_length(eval_dataset): - num_samples = len(eval_dataset) - # The instance check is weird and does not actually check for the type, but whether the dataset has the right - # methods. Therefore we need to make sure it also has the attribute. - elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0: - num_samples = eval_dataset.num_examples - else: - if has_length(dataloader): - num_samples = self.num_examples(dataloader) - else: # both len(dataloader.dataset) and len(dataloader) fail - num_samples = observed_num_examples - if num_samples == 0 and observed_num_examples > 0: - num_samples = observed_num_examples - - # Metrics! - if self.compute_metrics is not None and all_preds is not None and all_labels is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) + # Wait for the checkpoint to be uploaded. 
+ self._finish_current_push() - if all_losses is not None: - metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() - if hasattr(self, "jit_compilation_time"): - metrics[f"{metric_key_prefix}_jit_compilation_time"] = self.jit_compilation_time - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) - - def prediction_loop_ort( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> EvalLoopOutput: - """ - Prediction/evaluation loop, shared by `ORTTrainer.evaluate()` and `ORTTrainer.predict()`. - - Works both with or without labels. - """ - logger.info("[INFO] ONNX Runtime inference starts...") - - # Check if there are labels in the dataset - dummy_inputs = next(iter(dataloader)) - has_labels = all(dummy_inputs.get(k) is not None for k in self.label_names) - use_cache = ORTFeaturesManager.do_use_cache(self.feature) - - if self.onnx_model_path and (has_labels == self.exported_with_loss): - logger.info("[INFO] Inference with given ONNX model") - self.onnx_model_path = Path(self.onnx_model_path).as_posix() - else: - onnx_model_path = Path(self.args.output_dir) - - logger.info("[INFO] Exporting the model to ONNX...") - if self.args.deepspeed and self.args.fp16: - export_device = "cuda" - else: - export_device = "cpu" - - # With `label_smoother` the loss will be computed outside modeling - with_loss = has_labels and not self.label_smoother - self._export(onnx_model_path, with_loss=with_loss, device=export_device, use_cache=use_cache) - - self.exported_with_loss = with_loss - self.onnx_model_path = onnx_model_path.as_posix() - logger.info("[INFO] ONNX model is stored in:\n", self.onnx_model_path) - - # Load ORT model - support_loss_in_modeling = self.feature in [ - "text-generation", - "text-generation-with-past", - "text2text-generation", - "text2text-generation-with-past", - ] - support_feature = self.feature in ORTFeaturesManager.SUPPORTED_FEATURES - if support_loss_in_modeling or (not self.exported_with_loss and support_feature): - # Exported with standard outputs, use specific ORTModels - ort_model_cls = ORTFeaturesManager.get_model_class_for_feature(self.feature) - else: - ort_model_cls = ORTModelForCustomTasks - - model_id = self.onnx_model_path - args = self.args - if ort_model_cls is ORTModelForCausalLM: - ort_model = ort_model_cls.from_pretrained(model_id=model_id, use_cache=use_cache).to(args.device) - else: - ort_model = ort_model_cls.from_pretrained(model_id=model_id).to(args.device) - - if not has_length(dataloader): - raise ValueError("dataloader must implement a working __len__") - - prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only - - batch_size = dataloader.batch_size - num_examples = self.num_examples(dataloader) - logger.info(f"***** Running {description} *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Batch size = {batch_size}") - losses_host: torch.Tensor = None - preds_host: Union[torch.Tensor, List[torch.Tensor]] = None - labels_host: Union[torch.Tensor, List[torch.Tensor]] = None - inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None - - world_size = max(1, args.world_size) - - 
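# With the in-trainer ONNX Runtime inference loops removed, a rough way to get
# the equivalent behaviour is to export the trained checkpoint through an
# ORTModel and evaluate it directly. A sketch assuming a text-classification
# model; the checkpoint path is hypothetical and the ORTModel class should
# match the task that was trained.
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer, pipeline

checkpoint = "path/to/trainer/output_dir"  # hypothetical path
ort_model = ORTModelForSequenceClassification.from_pretrained(checkpoint, export=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

classifier = pipeline("text-classification", model=ort_model, tokenizer=tokenizer)
print(classifier("ONNX Runtime inference after training with ORTTrainer."))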
eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) - if not prediction_loss_only: - # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass - # a batch size to the sampler) - make_multiple_of = None - if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): - make_multiple_of = dataloader.sampler.batch_size - preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - - if args.past_index >= 0: - self._past = None - - self.callback_handler.eval_dataloader = dataloader - - for step, inputs in enumerate(dataloader): - loss, logits, labels = self.prediction_step_ort( - ort_model, inputs, prediction_loss_only, ignore_keys=ignore_keys - ) - inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None - - if loss is not None: - losses = loss.repeat(batch_size) - losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) - if logits is not None: - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) - if labels is not None: - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - if inputs_decode is not None: - inputs_host = ( - inputs_decode - if inputs_host is None - else nested_concat(inputs_host, inputs_decode, padding_index=-100) - ) - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. 
- if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) - if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) - inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) - - # Set back to None to begin a new accumulation - losses_host, preds_host, labels_host, inputs_host = None, None, None, None - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) - if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) - inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) - - eval_loss = eval_losses_gatherer.finalize() - preds = preds_gatherer.finalize() if not prediction_loss_only else None - label_ids = labels_gatherer.finalize() if not prediction_loss_only else None - inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None - - if self.compute_metrics is not None and preds is not None and label_ids is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) - - if eval_loss is not None: - metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return EvalLoopOutput(predictions=preds, label_ids=label_ids, metrics=metrics, num_samples=num_examples) - - def prediction_step_ort( - self, - model: ORTModel, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - - Args: - model (`ORTModel`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - - Return: - Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, - logits and labels (each being optional). 
- """ - has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names) - # For CLIP-like models capable of returning loss values. - # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss` - # is `True` in `model.forward`. - return_loss = inputs.get("return_loss", None) - if return_loss is None: - return_loss = self.can_return_loss - loss_without_labels = True if len(self.label_names) == 0 and return_loss else False - - inputs = self._prepare_inputs(inputs) - - if ignore_keys is None: - if hasattr(self.model, "config"): - ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) - else: - ignore_keys = [] - - # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. - if has_labels or loss_without_labels: - labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) - if len(labels) == 1: - labels = labels[0] - else: - labels = None - - with torch.no_grad(): - if is_sagemaker_mp_enabled(): - raise NotImplementedError( - "Sagemaker's distributed data parallel features are not supported by `ORTTrainer` yet." - ) - else: - if has_labels or loss_without_labels: - with self.compute_loss_context_manager(): - loss, outputs = self.compute_loss_ort(model, inputs, return_outputs=True) - loss = loss.mean().detach() - - if isinstance(outputs, dict): - logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"]) - else: - logits = outputs[1:] - else: - loss = None - with self.compute_loss_context_manager(): - outputs = model(**inputs) - if isinstance(outputs, dict): - logits = tuple(v for k, v in outputs.items() if k not in ignore_keys) - else: - logits = outputs - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index - 1] - - if prediction_loss_only: - return (loss, None, None) - - logits = nested_detach(logits) - if len(logits) == 1: - logits = logits[0] - - return (loss, logits, labels) - - def compute_loss_ort(self, model, inputs, return_outputs=False): - """ - How the loss is computed by ORTTrainer. By default, all models return the loss in the first element. - Subclass and override for custom behavior. - """ - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - outputs = model(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - if "text-generation" in self.feature: - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. 
- loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss - - def _export( - self, - model_path: os.PathLike, - model: Optional[PreTrainedModel] = None, - opset: Optional[int] = None, - device: str = "cpu", - with_loss: bool = True, - use_cache: bool = False, - ) -> None: - """ - Load and export a model to an ONNX format. - - Args: - model_path (`os.PathLike`): - The path used to save the model exported to an ONNX format. - model ([`PreTrainedModel`], *optional*): - The model to export. If not provided, a `model_path` must be passed. - opset (`int`, *optional*): - ONNX opset version to export the model with. - device (`str`, *optional*, defaults to `cpu`): - The device on which the ONNX model will be exported. Either `cpu` or `cuda`. - with_loss (`bool`, defaults to `True`): - Whether to export ONNX model with the loss in outputs. - """ - if model is None: - if not (self.args.fp16 and self.args.deepspeed): - # Taking CPU to export the model - self.model.to("cpu") - model = unwrap_model(self.model) - - onnx_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="onnx", task=self.feature - ) - onnx_config = onnx_config_constructor(model.config) - opset = onnx_config.DEFAULT_ONNX_OPSET if opset is None else opset - - is_decoder = isinstance(onnx_config, OnnxConfigWithPast) - - if is_decoder: - output_names = [ONNX_DECODER_NAME] - if use_cache is True: - output_names.append(ONNX_DECODER_WITH_PAST_NAME) - - models_and_onnx_configs = get_decoder_models_for_export(model, onnx_config) - if with_loss is True: - opset = max(opset, 12) - models_and_onnx_configs_with_loss = {} - for decoder_name, (decoder, decoder_config) in models_and_onnx_configs.items(): - models_and_onnx_configs_with_loss[decoder_name] = ( - decoder, - wrap_onnx_config_for_loss(decoder_config), - ) - - export_models( - models_and_onnx_configs=models_and_onnx_configs_with_loss if with_loss else models_and_onnx_configs, - opset=opset, - output_dir=model_path, - output_names=output_names, - device=device, - disable_dynamic_axes_fix=True, # onnxruntime floating point exception (core dumped) - ) - else: - if with_loss is True: - onnx_config = wrap_onnx_config_for_loss(onnx_config) - opset = max(opset, 12) # Operators like `nll_loss`are added for opset>=12 - - output_path = model_path / ONNX_WEIGHTS_NAME - _ = export(model=model, config=onnx_config, opset=opset, output=output_path, device=device) - - model.config.save_pretrained(model_path) + return TrainOutput(self.state.global_step, train_loss, metrics) def _wrap_model(self, model, training=True, dataloader=None): # TODO: ipex only works with inference with PyTorch, will move `inference_with_ort` to training arguments and @@ -1674,18 +896,24 @@ def _wrap_model(self, model, training=True, dataloader=None): auto_wrap_policy = None auto_wrapper_callable = None - if self.args.fsdp_config["fsdp_min_num_params"] > 0: + default_transformer_cls_names_to_wrap = getattr(model, "_no_split_modules", None) + fsdp_transformer_layer_cls_to_wrap = self.args.fsdp_config.get( + "transformer_layer_cls_to_wrap", default_transformer_cls_names_to_wrap + ) + + if self.args.fsdp_config["min_num_params"] > 0: auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, min_num_params=self.args.fsdp_config["fsdp_min_num_params"] + size_based_auto_wrap_policy, min_num_params=self.args.fsdp_config["min_num_params"] ) - elif self.args.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not 
None: + elif fsdp_transformer_layer_cls_to_wrap is not None: transformer_cls_to_wrap = set() - for layer_class in self.args.fsdp_config["fsdp_transformer_layer_cls_to_wrap"]: + for layer_class in fsdp_transformer_layer_cls_to_wrap: transformer_cls = get_module_class_from_name(model, layer_class) if transformer_cls is None: raise Exception("Could not find the transformer layer class to wrap in the model.") else: transformer_cls_to_wrap.add(transformer_cls) + auto_wrap_policy = functools.partial( transformer_auto_wrap_policy, # Transformer layer class to wrap diff --git a/optimum/onnxruntime/trainer_seq2seq.py b/optimum/onnxruntime/trainer_seq2seq.py index a281462fde..2e43ee89e0 100644 --- a/optimum/onnxruntime/trainer_seq2seq.py +++ b/optimum/onnxruntime/trainer_seq2seq.py @@ -14,40 +14,25 @@ """ The ORTSeq2SeqTrainer class, to easily train a sequence to sequence model in 🤗 Transformers from scratch or finetune it on a new task with ONNX Runtime. """ -from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union -import numpy as np import torch from torch import nn -from torch.utils.data import DataLoader, Dataset +from torch.utils.data import Dataset from transformers.deepspeed import is_deepspeed_zero3_enabled -from transformers.modeling_utils import PreTrainedModel, unwrap_model -from transformers.trainer_pt_utils import ( - DistributedTensorGatherer, - IterableDatasetShard, - SequentialDistributedSampler, - find_batch_size, - nested_concat, - nested_numpify, -) -from transformers.trainer_utils import ( - EvalLoopOutput, - EvalPrediction, - PredictionOutput, - denumpify_detensorize, - has_length, -) -from transformers.utils import logging - -from ..exporters import TasksManager -from ..exporters.onnx import export -from .modeling_ort import ORTModel -from .modeling_seq2seq import ORTModelForSeq2SeqLM +from transformers.trainer_utils import PredictionOutput +from transformers.utils import is_accelerate_available, logging + from .trainer import ORTTrainer -from .utils import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ONNX_ENCODER_NAME, wrap_onnx_config_for_loss +if is_accelerate_available(): + pass +else: + raise ImportError( + "The package `accelerate` is required to use the ORTTrainer. Please install it following https://huggingface.co/docs/accelerate/basic_tutorials/install." + ) + logger = logging.get_logger(__name__) @@ -57,15 +42,19 @@ def evaluate( eval_dataset: Optional[Dataset] = None, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval", - inference_with_ort: bool = False, **gen_kwargs, ) -> Dict[str, float]: """ - Run evaluation with ONNX Runtime or PyTorch backend and returns metrics. + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. Args: eval_dataset (`Dataset`, *optional*): - Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns + Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` method. ignore_keys (`List[str]`, *optional*): @@ -73,7 +62,14 @@ def evaluate( gathering predictions. metric_key_prefix (`str`, *optional*, defaults to `"eval"`): An optional prefix to be used as the metrics key prefix. 
For example the metrics "bleu" will be named - "eval_bleu" if the prefix is "eval" (default) + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. Returns: A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The @@ -81,26 +77,26 @@ def evaluate( """ gen_kwargs = gen_kwargs.copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + + # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the + # training args + if ( + gen_kwargs.get("max_length") is None + and gen_kwargs.get("max_new_tokens") is None + and self.args.generation_max_length is not None + ): gen_kwargs["max_length"] = self.args.generation_max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams - ) + if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: + gen_kwargs["num_beams"] = self.args.generation_num_beams self._gen_kwargs = gen_kwargs - return super().evaluate( - eval_dataset, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - inference_with_ort=inference_with_ort, - ) + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) def predict( self, test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - inference_with_ort: bool = False, + metric_key_prefix: str = "test", **gen_kwargs, ) -> "PredictionOutput": """ @@ -111,20 +107,27 @@ def predict( Args: test_dataset (`Dataset`): - Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the + Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. Has to implement the method `__len__` ignore_keys (`List[str]`, *optional*): A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"test"`): + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "test_bleu" if the prefix is "test" (default) + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. - If your predictions or labels have different sequence length (for instance because you're doing dynamic padding - in a token classification task) the predictions will be padded (on the right) to allow for concatenation into - one array. The padding index is -100. + If your predictions or labels have different sequence lengths (for instance because you're doing dynamic + padding in a token classification task) the predictions will be padded (on the right) to allow for + concatenation into one array. 
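# A small sketch of the legacy-argument precedence used by evaluate()/predict()
# here: explicitly passed generate kwargs win, otherwise non-None training
# arguments fill in max_length/num_beams, and anything still unset falls back
# to the model's generation_config inside generate(). The helper and the
# sample values are hypothetical.
def resolve_gen_kwargs(gen_kwargs, generation_max_length=None, generation_num_beams=None):
    gen_kwargs = dict(gen_kwargs)
    if (
        gen_kwargs.get("max_length") is None
        and gen_kwargs.get("max_new_tokens") is None
        and generation_max_length is not None
    ):
        gen_kwargs["max_length"] = generation_max_length
    if gen_kwargs.get("num_beams") is None and generation_num_beams is not None:
        gen_kwargs["num_beams"] = generation_num_beams
    return gen_kwargs

assert resolve_gen_kwargs({}, generation_max_length=128, generation_num_beams=4) == {
    "max_length": 128,
    "num_beams": 4,
}
assert resolve_gen_kwargs({"max_new_tokens": 32}, generation_max_length=128) == {"max_new_tokens": 32}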
The padding index is -100. @@ -137,486 +140,20 @@ def predict( """ gen_kwargs = gen_kwargs.copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.args.generation_max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams - ) - self._gen_kwargs = gen_kwargs - - return super().predict( - test_dataset, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - inference_with_ort=inference_with_ort, - ) - - def evaluation_loop_ort( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> EvalLoopOutput: - """ - Prediction/evaluation loop, shared by `ORTTrainer.evaluate()` and `ORTTrainer.predict()`. - - Works both with or without labels. - """ - logger.info("[INFO] ONNX Runtime inference starts...") - self.ort_model = None - - # Check if there are labels in the dataset - dummy_inputs = next(iter(dataloader)) - has_labels = all(dummy_inputs.get(k) is not None for k in self.label_names) - - # Export ONNX models - if self.onnx_model_path and (has_labels == self.exported_with_loss): - logger.info("[INFO] Inference with given ONNX model") - self.onnx_model_path = Path(self.onnx_model_path).as_posix() - else: - onnx_model_path = Path(self.args.output_dir) - logger.info("[INFO] Exporting the model to ONNX...") - if self.args.deepspeed and self.args.fp16: - export_device = "cuda" - else: - export_device = "cpu" - - with_loss = has_labels and not self.label_smoother - # Only need to export decoders if the models have been exported before. - decoders_only = True if self.onnx_model_path else False - self._export(onnx_model_path, with_loss=with_loss, device=export_device, decoders_only=decoders_only) - - self.exported_with_loss = with_loss - self.onnx_model_path = onnx_model_path.as_posix() - logger.info("[INFO] ONNX model is stored in:\n", self.onnx_model_path) - - args = self.args - # Load ORT model - self.ort_model = ORTModelForSeq2SeqLM.from_pretrained(model_id=self.onnx_model_path).to(args.device) - - prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only - - batch_size = dataloader.batch_size - - logger.info(f"***** Running {description} *****") - if has_length(dataloader): - logger.info(f" Num examples = {self.num_examples(dataloader)}") - else: - logger.info(" Num examples: Unknown") - logger.info(f" Batch size = {batch_size}") - - self.callback_handler.eval_dataloader = dataloader - # Do this before wrapping. - eval_dataset = getattr(dataloader, "dataset", None) - - if args.past_index >= 0: - self._past = None - - # Initialize containers - # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) - losses_host = None - preds_host = None - labels_host = None - inputs_host = None - - # losses/preds/labels on CPU (final containers) - all_losses = None - all_preds = None - all_labels = None - all_inputs = None - # Will be useful when we have an iterable dataset so don't know its length. - - observed_num_examples = 0 - # Main evaluation loop - for step, inputs in enumerate(dataloader): - # Update the observed num examples - observed_batch_size = find_batch_size(inputs) - if observed_batch_size is not None: - observed_num_examples += observed_batch_size - # For batch samplers, batch_size is not known by the dataloader in advance. 
- if batch_size is None: - batch_size = observed_batch_size - - # Prediction step(send also onnxruntime inference session) - loss, logits, labels = self.prediction_step_ort( - self.ort_model, inputs, prediction_loss_only, ignore_keys=ignore_keys - ) - inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None - - # Update containers on host - if loss is not None: - losses = self.accelerator.gather_for_metrics((loss.repeat(batch_size))) - losses_host = losses if losses_host is None else nested_concat(losses_host, losses, padding_index=-100) - if labels is not None: - labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) - if inputs_decode is not None: - inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100) - inputs_decode = self.accelerator.gather_for_metrics((inputs_decode)) - inputs_host = ( - inputs_decode - if inputs_host is None - else nested_concat(inputs_host, inputs_decode, padding_index=-100) - ) - if logits is not None: - logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100) - if self.preprocess_logits_for_metrics is not None: - logits = self.preprocess_logits_for_metrics(logits, labels) - logits = self.accelerator.gather_for_metrics((logits)) - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) - - if labels is not None: - labels = self.accelerator.gather_for_metrics((labels)) - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. - if args.eval_accumulation_steps is not None and self.accelerator.sync_gradients: - if losses_host is not None: - losses = nested_numpify(losses_host) - all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) - if preds_host is not None: - logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) - if inputs_host is not None: - inputs_decode = nested_numpify(inputs_host) - all_inputs = ( - inputs_decode - if all_inputs is None - else nested_concat(all_inputs, inputs_decode, padding_index=-100) - ) - if labels_host is not None: - labels = nested_numpify(labels_host) - all_labels = ( - labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) - ) - - # Set back to None to begin a new accumulation - losses_host, preds_host, inputs_host, labels_host = None, None, None, None - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - if losses_host is not None: - losses = nested_numpify(losses_host) - all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) - if preds_host is not None: - logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) - if inputs_host is not None: - inputs_decode = nested_numpify(inputs_host) - all_inputs = ( - inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) - ) - if labels_host is not None: - labels = nested_numpify(labels_host) - all_labels = labels if all_labels is None else 
nested_concat(all_labels, labels, padding_index=-100) - - # Number of samples - if has_length(eval_dataset): - num_samples = len(eval_dataset) - # The instance check is weird and does not actually check for the type, but whether the dataset has the right - # methods. Therefore we need to make sure it also has the attribute. - elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0: - num_samples = eval_dataset.num_examples - else: - if has_length(dataloader): - num_samples = self.num_examples(dataloader) - else: # both len(dataloader.dataset) and len(dataloader) fail - num_samples = observed_num_examples - if num_samples == 0 and observed_num_examples > 0: - num_samples = observed_num_examples - - # Metrics! - if self.compute_metrics is not None and all_preds is not None and all_labels is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) - - if all_losses is not None: - metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) - - def prediction_loop_ort( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> PredictionOutput: - """ - Prediction/evaluation loop, shared by `ORTTrainer.evaluate()` and `ORTTrainer.predict()`. - - Works both with or without labels. - """ - logger.info("[INFO] ONNX Runtime inference starts...") - self.ort_model = None - - # Check if there are labels in the dataset - dummy_inputs = next(iter(dataloader)) - has_labels = all(dummy_inputs.get(k) is not None for k in self.label_names) - - # Export ONNX models - if self.onnx_model_path and (has_labels == self.exported_with_loss): - logger.info("[INFO] Inference with given ONNX model") - self.onnx_model_path = Path(self.onnx_model_path).as_posix() - else: - onnx_model_path = Path(self.args.output_dir) - logger.info("[INFO] Exporting the model to ONNX...") - if self.args.deepspeed and self.args.fp16: - export_device = "cuda" - else: - export_device = "cpu" - - with_loss = has_labels and not self.label_smoother - # Only need to export decoders if the models have been exported before. 
- decoders_only = True if self.onnx_model_path else False - self._export(onnx_model_path, with_loss=with_loss, device=export_device, decoders_only=decoders_only) - - self.exported_with_loss = with_loss - self.onnx_model_path = onnx_model_path.as_posix() - logger.info("[INFO] ONNX model is stored in:\n", self.onnx_model_path) - - args = self.args - # Load ORT model - self.ort_model = ORTModelForSeq2SeqLM.from_pretrained( - model_id=self.onnx_model_path, provider="CUDAExecutionProvider" - ) - - if not has_length(dataloader): - raise ValueError("dataloader must implement a working __len__") - - prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only - - batch_size = dataloader.batch_size - num_examples = self.num_examples(dataloader) - logger.info(f"***** Running {description} *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Batch size = {batch_size}") - losses_host: torch.Tensor = None - preds_host: Union[torch.Tensor, List[torch.Tensor]] = None - labels_host: Union[torch.Tensor, List[torch.Tensor]] = None - inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None - - world_size = max(1, args.world_size) - - eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) - if not prediction_loss_only: - # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass - # a batch size to the sampler) - make_multiple_of = None - if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): - make_multiple_of = dataloader.sampler.batch_size - preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - - if args.past_index >= 0: - self._past = None - - self.callback_handler.eval_dataloader = dataloader - - for step, inputs in enumerate(dataloader): - loss, logits, labels = self.prediction_step_ort( - self.ort_model, inputs, prediction_loss_only, ignore_keys=ignore_keys - ) - inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None - - if loss is not None: - losses = loss.repeat(batch_size) - losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) - if logits is not None: - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) - if labels is not None: - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - if inputs_decode is not None: - inputs_host = ( - inputs_decode - if inputs_host is None - else nested_concat(inputs_host, inputs_decode, padding_index=-100) - ) - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. 
- if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) - if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) - inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) - - # Set back to None to begin a new accumulation - losses_host, preds_host, labels_host, inputs_host = None, None, None, None - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) - if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) - inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) - - eval_loss = eval_losses_gatherer.finalize() - preds = preds_gatherer.finalize() if not prediction_loss_only else None - label_ids = labels_gatherer.finalize() if not prediction_loss_only else None - inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None - - if self.compute_metrics is not None and preds is not None and label_ids is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) - - if eval_loss is not None: - metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) - - def prediction_step_ort( - self, - model: ORTModel, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - **gen_kwargs, - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - - Subclass and override to inject custom behavior. - - Args: - model (`ORTModel`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - ignore_keys (`Lst[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - gen_kwargs: - Additional `generate` specific kwargs. - - Return: - Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss=None, generated - tokens and labels (each being optional). 
- """ - if not self.args.predict_with_generate or prediction_loss_only: - return super().prediction_step_ort( - model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys - ) - - has_labels = "labels" in inputs - inputs = self._prepare_inputs(inputs) - - # Priority (handled in generate): - # gen_kwargs > model.generation_config > default GenerationConfig() - - if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): - gen_kwargs = self._gen_kwargs.copy() - - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.model.config.max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams - ) - default_synced_gpus = True if is_deepspeed_zero3_enabled() else False - gen_kwargs["synced_gpus"] = ( - gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus - ) - - # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate - # (otherwise, it would continue generating from the padded `decoder_input_ids`) + # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the + # training args if ( - "labels" in inputs - and "decoder_input_ids" in inputs - and inputs["labels"].shape == inputs["decoder_input_ids"].shape + gen_kwargs.get("max_length") is None + and gen_kwargs.get("max_new_tokens") is None + and self.args.generation_max_length is not None ): - inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} - generated_tokens = self.model.generate(**inputs, **gen_kwargs) - - # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop - # TODO: remove this hack when the legacy code that initializes generation_config from a model config is - # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 - if self.model.generation_config._from_model_config: - self.model.generation_config._from_model_config = False - # Retrieves GenerationConfig from model.generation_config - gen_config = self.model.generation_config - # in case the batch is shorter than max length, the output should be padded - if generated_tokens.shape[-1] < gen_config.max_length: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) - elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) - - with torch.no_grad(): - with self.compute_loss_context_manager(): - if self.label_smoother is not None: - onnx_inputs = {k: v for k, v in inputs.items() if k != "labels"} - outputs = model(**onnx_inputs) - else: - outputs = model(**inputs) - if has_labels: - if self.label_smoother is not None: - labels = inputs["labels"] - # With label smoother, loss will be calculated out of box - # So the outputs of InferenceSession need to be converted to tensor and sent to the same device - loss = self.label_smoother(outputs, labels.to(outputs.logits.device)).mean().detach() - else: - loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() - else: - loss = None - - if self.args.prediction_loss_only: - return loss, None, None - - if has_labels: - labels = inputs["labels"] - if labels.shape[-1] < 
gen_config.max_length: - labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) - elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: - labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) - else: - labels = None + gen_kwargs["max_length"] = self.args.generation_max_length + if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: + gen_kwargs["num_beams"] = self.args.generation_num_beams + self._gen_kwargs = gen_kwargs - return loss, generated_tokens, labels + return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) def prediction_step( self, @@ -657,31 +194,31 @@ def prediction_step( has_labels = "labels" in inputs inputs = self._prepare_inputs(inputs) + # XXX: adapt synced_gpus for fairscale as well # Priority (handled in generate): - # gen_kwargs > model.generation_config > default GenerationConfig() - + # non-`None` gen_kwargs > model.generation_config > default GenerationConfig() if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): gen_kwargs = self._gen_kwargs.copy() + if "num_beams" in gen_kwargs and gen_kwargs["num_beams"] is None: + gen_kwargs.pop("num_beams") + if "max_length" in gen_kwargs and gen_kwargs["max_length"] is None: + gen_kwargs.pop("max_length") - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.model.config.max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams - ) default_synced_gpus = True if is_deepspeed_zero3_enabled() else False gen_kwargs["synced_gpus"] = ( gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus ) + generation_inputs = inputs.copy() # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate # (otherwise, it would continue generating from the padded `decoder_input_ids`) if ( - "labels" in inputs - and "decoder_input_ids" in inputs - and inputs["labels"].shape == inputs["decoder_input_ids"].shape + "labels" in generation_inputs + and "decoder_input_ids" in generation_inputs + and generation_inputs["labels"].shape == generation_inputs["decoder_input_ids"].shape ): - inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} - generated_tokens = self.model.generate(**inputs, **gen_kwargs) + generation_inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} + generated_tokens = self.model.generate(**generation_inputs, **gen_kwargs) # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop # TODO: remove this hack when the legacy code that initializes generation_config from a model config is @@ -739,85 +276,3 @@ def _pad_tensors_to_max_len(self, tensor, max_length): ) padded_tensor[:, : tensor.shape[-1]] = tensor return padded_tensor - - def _export( - self, - save_dir: Union[str, Path], - model: Optional[PreTrainedModel] = None, - opset: Optional[int] = None, - device: str = "cpu", - with_loss: bool = True, - decoders_only: bool = False, - **kwargs, - ) -> None: - """ - Load and export a sequence-to-sequence model to ONNX models(encoder and decoder(s)). - - Args: - save_dir (`str` or `Path`): - The directory where the ONNX models(encoder, decoder...) should be saved, default to - `transformers.file_utils.default_cache_path`, which is the cache dir for transformers. 
- device (`str`, *optional*, defaults to `cpu`): - The device on which the ONNX model will be exported. Either `cpu` or `cuda`. - with_loss (`bool`, defaults to `True`): - Whether to export ONNX model with the loss in outputs. - decoders_only (`bool`, defaults to `False`): - Whether to just export decoder models. - """ - if model is None: - if not (self.args.fp16 and self.args.deepspeed): - # Taking CPU to export the model - self.model.to("cpu") - model = unwrap_model(self.model) - - onnx_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="onnx", task=self.feature - ) - onnx_config = onnx_config_constructor(model.config) - - opset = onnx_config.DEFAULT_ONNX_OPSET if opset is None else opset - - encoder = model.get_encoder() - - onnx_config_encoder = onnx_config.with_behavior("encoder") - onnx_config_decoder = onnx_config.with_behavior("decoder", use_past=False) - onnx_config_decoder_with_past = onnx_config.with_behavior("decoder", use_past=True) - - if with_loss: - # Add `loss` to the ONNX config of decoders - onnx_config_decoder = wrap_onnx_config_for_loss(onnx_config_decoder) - onnx_config_decoder_with_past = wrap_onnx_config_for_loss(onnx_config_decoder_with_past) - opset = max(opset, 12) # Operators like `nll_loss`are added for opset>=12 - - # Export the encoder - if not decoders_only: - _ = export( - model=encoder, - config=onnx_config_encoder, - opset=opset, - output=Path(save_dir).joinpath(ONNX_ENCODER_NAME), - device=device, - ) - # Export the decoder without the past key values - export( - model=model, - config=onnx_config_decoder, - opset=opset, - output=Path(save_dir).joinpath(ONNX_DECODER_NAME), - device=device, - ) - - # Export the decoder with the past key values - use_cache = kwargs.get("use_cache", True) - if use_cache: - export( - model=model, - config=onnx_config_decoder_with_past, - opset=opset, - output=Path(save_dir).joinpath(ONNX_DECODER_WITH_PAST_NAME), - device=device, - ) - - # TODO: Need to use merged decoder to reduce the use of GPU memory - - model.config.save_pretrained(save_dir) diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 88e5fc9bc2..a0cb7c8e98 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -34,6 +34,7 @@ from transformers.utils import ( ExplicitEnum, get_full_repo_name, + is_accelerate_available, is_safetensors_available, is_torch_available, is_torch_bf16_cpu_available, @@ -41,6 +42,7 @@ is_torch_tf32_available, logging, ) +from transformers.utils.generic import strtobool if is_torch_available(): @@ -137,8 +139,9 @@ def __post_init__(self): if self.load_best_model_at_end: if self.evaluation_strategy != self.save_strategy: raise ValueError( - "--load_best_model_at_end requires the save and eval strategy to match, but found\n- Evaluation " - f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" + "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " + "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps " + f"{self.save_steps} and eval_steps {self.eval_steps}." 
) if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: if self.eval_steps < 1 or self.save_steps < 1: @@ -189,14 +192,15 @@ def __post_init__(self): self.half_precision_backend = self.fp16_backend if self.bf16 or self.bf16_full_eval: - if self.no_cuda and not is_torch_bf16_cpu_available(): + if self.use_cpu and not is_torch_bf16_cpu_available(): # cpu raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). You need torch>=1.10") - elif not self.no_cuda and torch.cuda.is_available() and not is_torch_bf16_gpu_available(): - # gpu - raise ValueError( - "Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0" - ) + elif not self.use_cpu: + if torch.cuda.is_available() and not is_torch_bf16_gpu_available(): + # gpu + raise ValueError( + "Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0" + ) if self.fp16 and self.bf16: raise ValueError("At most one of fp16 and bf16 can be True, but not both") @@ -286,6 +290,7 @@ def __post_init__(self): " otherwise." ) torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True else: logger.warning( "The speedups for torchdynamo mostly come wih GPU Ampere or higher and which is not detected here." @@ -294,13 +299,24 @@ def __post_init__(self): if self.tf32: if is_torch_tf32_available(): torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True else: raise ValueError("--tf32 requires Ampere or a newer GPU arch, cuda>=11 and torch>=1.7") else: if is_torch_tf32_available(): torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False # no need to assert on else + # if training args is specified, it will override the one specified in the accelerate config + if self.half_precision_backend != "apex": + mixed_precision_dtype = os.environ.get("ACCELERATE_MIXED_PRECISION", "no") + if self.fp16: + mixed_precision_dtype = "fp16" + elif self.bf16: + mixed_precision_dtype = "bf16" + os.environ["ACCELERATE_MIXED_PRECISION"] = mixed_precision_dtype + if self.report_to is None: logger.info( "The default value for the training argument `--report_to` will change in v5 (from all installed " @@ -342,44 +358,44 @@ def __post_init__(self): self.fsdp_config = {} if isinstance(self.fsdp_config, str): + if len(self.fsdp) == 0: + warnings.warn("`--fsdp_config` is useful only when `--fsdp` is specified.") with io.open(self.fsdp_config, "r", encoding="utf-8") as f: self.fsdp_config = json.load(f) + for k in list(self.fsdp_config.keys()): + if k.startswith("fsdp_"): + v = self.fsdp_config.pop(k) + self.fsdp_config[k[5:]] = v if self.fsdp_min_num_params > 0: warnings.warn("using `--fsdp_min_num_params` is deprecated. 
Use fsdp_config instead ", FutureWarning) - self.fsdp_config["fsdp_min_num_params"] = max( - self.fsdp_config.get("fsdp_min_num_params", 0), self.fsdp_min_num_params - ) + self.fsdp_config["min_num_params"] = max(self.fsdp_config.get("min_num_params", 0), self.fsdp_min_num_params) - # if fsdp_config["fsdp_transformer_layer_cls_to_wrap"] is specified as a string, convert it to a list with a single object - if isinstance(self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None), str): - self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = [ - self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"] - ] + # if fsdp_config["transformer_layer_cls_to_wrap"] is specified as a string, convert it to a list with a single object + if isinstance(self.fsdp_config.get("transformer_layer_cls_to_wrap", None), str): + self.fsdp_config["transformer_layer_cls_to_wrap"] = [self.fsdp_config["transformer_layer_cls_to_wrap"]] if self.fsdp_transformer_layer_cls_to_wrap is not None: warnings.warn( "using `--fsdp_transformer_layer_cls_to_wrap` is deprecated. Use fsdp_config instead ", FutureWarning ) - self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = self.fsdp_config.get( - "fsdp_transformer_layer_cls_to_wrap", [] + self.fsdp_config["transformer_layer_cls_to_wrap"] = self.fsdp_config.get( + "transformer_layer_cls_to_wrap", [] ) + [self.fsdp_transformer_layer_cls_to_wrap] - if len(self.fsdp) == 0 and self.fsdp_config["fsdp_min_num_params"] > 0: - warnings.warn("`--fsdp_min_num_params` is useful only when `--fsdp` is specified.") + if len(self.fsdp) == 0 and self.fsdp_config["min_num_params"] > 0: + warnings.warn("`min_num_params` is useful only when `--fsdp` is specified.") - if len(self.fsdp) == 0 and self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None: - warnings.warn("`--fsdp_transformer_layer_cls_to_wrap` is useful only when `--fsdp` is specified.") + if len(self.fsdp) == 0 and self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None: + warnings.warn("`transformer_layer_cls_to_wrap` is useful only when `--fsdp` is specified.") if ( len(self.fsdp) > 0 - and self.fsdp_config["fsdp_min_num_params"] > 0 - and self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None + and self.fsdp_config["min_num_params"] > 0 + and self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None ): - raise ValueError( - "`--fsdp_min_num_params` and `--fsdp_transformer_layer_cls_to_wrap` are mutually exclusive." 
- ) + raise ValueError("`min_num_params` and `transformer_layer_cls_to_wrap` are mutually exclusive.") self.fsdp_config["xla"] = self.fsdp_config.get("xla", False) self.fsdp_config["xla_fsdp_grad_ckpt"] = self.fsdp_config.get("xla_fsdp_grad_ckpt", False) if self.fsdp_config["xla"]: @@ -405,23 +421,29 @@ def __post_init__(self): FSDP_SHARDING_STRATEGY, ) + prefix = "FSDP_" for fsdp_option in self.fsdp: if fsdp_option.upper() in FSDP_SHARDING_STRATEGY: # set environment variable for FSDP sharding strategy - os.environ["FSDP_SHARDING_STRATEGY"] = str(FSDP_SHARDING_STRATEGY.index(fsdp_option.upper()) + 1) + os.environ[f"{prefix}SHARDING_STRATEGY"] = str( + FSDP_SHARDING_STRATEGY.index(fsdp_option.upper()) + 1 + ) elif fsdp_option == FSDPOption.OFFLOAD: - os.environ["FSDP_OFFLOAD_PARAMS"] = "true" + os.environ[f"{prefix}OFFLOAD_PARAMS"] = "true" elif fsdp_option == FSDPOption.AUTO_WRAP: - if self.fsdp_config["fsdp_min_num_params"] > 0: - os.environ["FSDP_MIN_NUM_PARAMS"] = str(self.fsdp_config["fsdp_min_num_params"]) - os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[1] - elif self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None: - os.environ["FSDP_TRANSFORMER_CLS_TO_WRAP"] = ",".join( - self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"] + os.environ[f"{prefix}AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[0] + if self.fsdp_config["min_num_params"] > 0: + os.environ[f"{prefix}MIN_NUM_PARAMS"] = str(self.fsdp_config["min_num_params"]) + os.environ[f"{prefix}AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[1] + elif self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None: + os.environ[f"{prefix}TRANSFORMER_CLS_TO_WRAP"] = ",".join( + self.fsdp_config["transformer_layer_cls_to_wrap"] ) - os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[0] prefetch_policy = self.fsdp_config.get("fsdp_backward_prefetch", "NO_PREFETCH") - os.environ["FSDP_BACKWARD_PREFETCH"] = prefetch_policy.upper() + os.environ[f"{prefix}BACKWARD_PREFETCH"] = prefetch_policy.upper() + os.environ[f"{prefix}FORWARD_PREFETCH"] = self.fsdp_config.get("forward_prefect", "false") + os.environ[f"{prefix}SYNC_MODULE_STATES"] = self.fsdp_config.get("sync_module_states", "true") + os.environ[f"{prefix}USE_ORIG_PARAMS"] = self.fsdp_config.get("use_orig_params", "false") if self.tpu_metrics_debug: warnings.warn( @@ -444,7 +466,9 @@ def __post_init__(self): if self.deepspeed: # - must be run very last in arg parsing, since it will use a lot of these settings. # - must be run before the model is created. 
- from transformers.deepspeed import HfTrainerDeepSpeedConfig + if not is_accelerate_available(): + raise ValueError("--deepspeed requires Accelerate to be installed: `pip install accelerate`.") + from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig # will be used later by the Trainer # note: leave self.deepspeed unmodified in case a user relies on it not to be modified) @@ -456,6 +480,14 @@ def __post_init__(self): os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" self.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.hf_deepspeed_config) + elif strtobool(os.environ.get("ACCELERATE_USE_DEEPSPEED", "false")): + # Accelerate DeepSpeed Plugin + from accelerate.utils import DeepSpeedPlugin + + self.deepspeed_plugin = DeepSpeedPlugin() + mixed_precision = os.environ.get("ACCELERATE_MIXED_PRECISION", "no") + self.deepspeed_plugin.set_mixed_precision(mixed_precision) + self.deepspeed_plugin.set_deepspeed_weakref() if self.push_to_hub_token is not None: warnings.warn( diff --git a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer index 62f7efc817..7266ba224a 100644 --- a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer +++ b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer @@ -34,7 +34,7 @@ ARG TORCHVISION_VERSION=0.15.1 # Install and update tools to minimize security vulnerabilities RUN apt-get update RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \ - bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \ + bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \ apt-get clean RUN unattended-upgrade RUN apt-get autoremove -y @@ -65,7 +65,7 @@ RUN $PYTHON_EXE -m pip install onnx ninja RUN $PYTHON_EXE -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} -f https://download.pytorch.org/whl/${TORCH_CUDA_VERSION} # ORT Module -RUN $PYTHON_EXE -m pip install onnxruntime-training==1.15.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html +RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html RUN $PYTHON_EXE -m pip install torch-ort ENV TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX" RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2 @@ -76,4 +76,5 @@ COPY . /workspace/optimum RUN pip install /workspace/optimum[tests] ENV TEST_LEVEL=1 -CMD RUN_SLOW=1 pytest -v -rs onnxruntime/nightly_test_trainer.py --durations=0 \ No newline at end of file +CMD RUN_SLOW=1 pytest -v -rs onnxruntime/training/nightly_test_trainer.py --durations=0 +CMD RUN_SLOW=1 pytest -v -rs onnxruntime/training/nightly_test_examples.py --durations=0 \ No newline at end of file diff --git a/tests/onnxruntime/training/nightly_test_examples.py b/tests/onnxruntime/training/nightly_test_examples.py new file mode 100644 index 0000000000..a16913a097 --- /dev/null +++ b/tests/onnxruntime/training/nightly_test_examples.py @@ -0,0 +1,219 @@ +# coding=utf-8 +# Copyright 2023 the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test ONNX Runtime Training Examples in Optimum.""" + +import subprocess +import unittest + +import pytest +from transformers.testing_utils import slow + + +@slow +class ORTTrainerExampleTest(unittest.TestCase): + def test_text_classification(self): + subprocess.run( + "cp ../examples/onnxruntime/training/text-classification/run_glue.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_glue.py" + " --model_name_or_path distilbert-base-uncased" + " --task_name mnli" + " --max_seq_length 64" + " --learning_rate 3e-6" + " --do_train" + " --output_dir /tmp/distilbert" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 8" + " --fp16 --optim adamw_ort_fused" + " --max_train_samples 20", + shell=True, + check=True, + ) + + def test_token_classification(self): + subprocess.run( + "cp ../examples/onnxruntime/training/token-classification/run_ner.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_ner.py" + " --model_name_or_path bert-base-cased" + " --dataset_name conll2003" + " --do_train" + " --output_dir /tmp/bert" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 8" + " --fp16 --optim adamw_ort_fused" + " --max_train_samples 20", + shell=True, + check=True, + ) + + def test_translation(self): + subprocess.run( + "cp ../examples/onnxruntime/training/translation/run_translation.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_translation.py" + " --model_name_or_path t5-small" + " --dataset_name wmt16" + " --dataset_config ro-en" + " --label_smoothing 0.1" + " --predict_with_generate" + " --source_lang en" + " --target_lang ro" + " --do_train" + " --max_train_samples 30" + " --output_dir /tmp/t5" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 2" + " --fp16 --optim adamw_ort_fused", + shell=True, + check=True, + ) + + @pytest.mark.skip(reason="skip for now") + def test_summarization(self): + subprocess.run( + "cp ../examples/onnxruntime/training/summarization/run_summarization.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_summarization.py" + " --model_name_or_path t5-small" + " --do_train" + " --do_eval" + " --dataset_name cnn_dailymail" + ' --dataset_config "3.0.0"' + ' --source_prefix "summarize: "' + " --predict_with_generate" + " --max_train_samples 30" + " --output_dir /tmp/t5" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 2" + " --per_device_eval_batch_size 2" + " --fp16 --optim adamw_ort_fused", + shell=True, + check=True, + ) + + # TODO: Update the example and add the test + def test_stable_diffusion_txt2img(self): + pass + + @pytest.mark.skip(reason="skip for now") + def test_question_answering(self): + subprocess.run( + "cp ../examples/onnxruntime/training/question-answering/run_qa.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_qa.py" + " 
--model_name_or_path bert-base-uncased" + " --do_train" + " --do_eval" + " --dataset_name squad" + " --max_train_samples 30" + " --output_dir /tmp/bert" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 2" + " --per_device_eval_batch_size 2" + " --fp16 --optim adamw_ort_fused", + shell=True, + check=True, + ) + + @pytest.mark.skip(reason="skip for now") + def test_language_modeling(self): + subprocess.run( + "cp ../examples/onnxruntime/training/question-answering/run_qa.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_clm.py" + " --model_name_or_path gpt2" + " --do_train" + " --do_eval" + " --dataset_name wikitext" + " --dataset_config_name wikitext-2-raw-v1" + " --max_train_samples 30" + " --output_dir /tmp/gpt2" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 2" + " --per_device_eval_batch_size 2" + " --fp16 --optim adamw_ort_fused", + shell=True, + check=True, + ) + + @pytest.mark.skip(reason="skip for now") + def test_image_classification(self): + subprocess.run( + "cp ../examples/onnxruntime/training/image-classification/run_image_classification.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_image_classification.py" + " --model_name_or_path google/vit-base-patch16-224-in21k" + " --do_train" + " --do_eval" + " --dataset_name beans" + " --max_train_samples 30" + " --output_dir /tmp/vit" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 2" + " --per_device_eval_batch_size 2" + " --fp16 --optim adamw_ort_fused", + shell=True, + check=True, + ) diff --git a/tests/onnxruntime/nightly_test_trainer.py b/tests/onnxruntime/training/nightly_test_trainer.py similarity index 54% rename from tests/onnxruntime/nightly_test_trainer.py rename to tests/onnxruntime/training/nightly_test_trainer.py index 2eb3ca433f..e24ee30617 100644 --- a/tests/onnxruntime/nightly_test_trainer.py +++ b/tests/onnxruntime/training/nightly_test_trainer.py @@ -12,11 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+"""Test ONNX Runtime Training ORTTrainer in Optimum.""" import gc +import os import random -import subprocess -import sys import tempfile import unittest from itertools import chain @@ -25,7 +25,6 @@ import nltk import numpy as np -import pytest from datasets import load_dataset from evaluate import load from transformers import ( @@ -35,12 +34,16 @@ AutoModelForTokenClassification, AutoTokenizer, DataCollatorForSeq2Seq, - DataCollatorForTokenClassification, DataCollatorWithPadding, default_data_collator, is_torch_available, ) -from transformers.testing_utils import require_deepspeed, require_torch, slow +from transformers.testing_utils import ( + mockenv_context, + require_deepspeed, + require_torch, + slow, +) from transformers.training_args import OptimizerNames @@ -75,11 +78,11 @@ "data_collator": default_data_collator, "data_collator_class": DataCollatorWithPadding, }, - "token-classification": { - "dataset": ["conll2003"], - "metric": ["seqeval"], - "data_collator_class": DataCollatorForTokenClassification, - }, + # "token-classification": { + # "dataset": ["conll2003"], + # "metric": ["seqeval"], + # "data_collator_class": DataCollatorForTokenClassification, + # }, } _DECODER_TASKS_DATASETS_CONFIGS = { @@ -88,11 +91,6 @@ "metric": ["accuracy"], "data_collator": default_data_collator, }, - "text-generation-with-past": { - "dataset": ["wikitext", "wikitext-2-raw-v1"], - "metric": ["accuracy"], - "data_collator": default_data_collator, - }, } _SEQ2SEQ_TASKS_DATASETS_CONFIGS = { @@ -101,30 +99,37 @@ "metric": ["rouge"], "data_collator_class": DataCollatorForSeq2Seq, }, - "text2text-generation-with-past": { - "dataset": ["xsum"], - "metric": ["rouge"], - "data_collator_class": DataCollatorForSeq2Seq, - }, } +# List supported ORT optimizers to test +optim_test_params = [] +if is_torch_available(): + default_adam_kwargs = { + "betas": (ORTTrainingArguments.adam_beta1, ORTTrainingArguments.adam_beta2), + "eps": ORTTrainingArguments.adam_epsilon, + "lr": ORTTrainingArguments.learning_rate, + } -def _get_models_to_test(model_list, task_list, both_inf_backend=False, excluded: Optional[List[str]] = None): + optim_test_params = [ + ( + ORTOptimizerNames.ADAMW_ORT_FUSED, + onnxruntime.training.optim.FusedAdam, + default_adam_kwargs, + ), + ] + +# default torch.distributed port +DEFAULT_MASTER_PORT = "10999" + + +def _get_models_to_test(model_list, task_list, excluded: Optional[List[str]] = None): models_to_test = [] for name, model_name in model_list: - for feature, data_metric_config in task_list.items(): - if excluded and (name in excluded or feature in excluded): + for task, data_metric_config in task_list.items(): + if excluded and (name in excluded or task in excluded): continue - if both_inf_backend: - models_to_test.append( - (f"{name}_{feature}", model_name, feature, data_metric_config, True) - ) # inference_with_ort=True - models_to_test.append( - (f"{name}_{feature}", model_name, feature, data_metric_config, False) - ) # inference_with_ort=False - else: - models_to_test.append((f"{name}_{feature}", model_name, feature, data_metric_config)) + models_to_test.append((f"{name}_{task}", model_name, task, data_metric_config)) return sorted(models_to_test) @@ -151,17 +156,39 @@ def _get_data_collator(data_metric_config, tokenizer=None, model=None, training_ return data_collator -def get_ort_training_args(feature, **kwargs): - if feature in _ENCODER_TASKS_DATASETS_CONFIGS or feature in _DECODER_TASKS_DATASETS_CONFIGS: +def get_ort_training_args(task, **kwargs): + if task in 
_ENCODER_TASKS_DATASETS_CONFIGS or task in _DECODER_TASKS_DATASETS_CONFIGS: training_args = ORTTrainingArguments(**kwargs) - elif feature in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: + elif task in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: training_args = ORTSeq2SeqTrainingArguments(**kwargs) return training_args +def get_master_port(real_launcher=False): + """ + When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed) + the issue is that once the port is tied it can't be used anywhere else outside of this process, + since torch.dist doesn't free the port until the process exits. Therefore for the sake of being + able to run both emulated launcher and normal launcher tests we need 2 distinct ports. + + This function will give the right port in the right context. For real launcher it'll give the + base port, for emulated launcher it'll give the base port + 1. In both cases a string is + returned. + + Args: + `real_launcher`: whether a real launcher is going to be used, or the emulated one + + """ + + master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) + if not real_launcher: + master_port_base = str(int(master_port_base) + 1) + return master_port_base + + def get_ort_trainer( model_name, - feature, + task, data_metric_config, training_args, max_seq_length=None, @@ -170,7 +197,7 @@ def get_ort_trainer( max_test_samples=None, **kwargs, ): - training_kwargs = load_and_prepare(feature)( + training_kwargs = load_and_prepare(task)( model_name, data_metric_config, max_seq_length, @@ -185,26 +212,25 @@ def get_ort_trainer( if getattr(training_args, "predict_with_generate", False) is not True: training_kwargs.pop("compute_metrics", None) - if feature in _ENCODER_TASKS_DATASETS_CONFIGS or feature in _DECODER_TASKS_DATASETS_CONFIGS: - trainer = ORTTrainer(feature=feature, args=training_args, **training_kwargs) - elif feature in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: - trainer = ORTSeq2SeqTrainer(feature=feature, args=training_args, **training_kwargs) + if task in _ENCODER_TASKS_DATASETS_CONFIGS or task in _DECODER_TASKS_DATASETS_CONFIGS: + trainer = ORTTrainer(args=training_args, **training_kwargs) + elif task in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: + trainer = ORTSeq2SeqTrainer(args=training_args, **training_kwargs) else: raise return trainer, test_dataset -def load_and_prepare(feature): +def load_and_prepare(task): preprocess_mapping = { "text-classification": load_and_prepare_glue, "token-classification": load_and_prepare_ner, "text-generation": load_and_prepare_clm, "text-generation-with-past": load_and_prepare_clm, "text2text-generation": load_and_prepare_xsum, - "text2text-generation-with-past": load_and_prepare_xsum, } - return preprocess_mapping[feature] + return preprocess_mapping[task] def load_and_prepare_glue(model_name, data_metric_config, max_seq_length, padding="max_length", **kwargs): @@ -520,212 +546,140 @@ class ORTTrainerIntegrationTest(unittest.TestCase): def setUp(self): super().setUp() args = ORTTrainingArguments("..") + master_port = get_master_port(real_launcher=False) + self.dist_env_1_gpu = { + "MASTER_ADDR": "localhost", + "MASTER_PORT": master_port, + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + } self.n_epochs = min(args.num_train_epochs, 1) - self.per_device_train_batch_size = args.per_device_train_batch_size - self.per_device_eval_batch_size = args.per_device_eval_batch_size + self.per_device_train_batch_size = min(args.per_device_train_batch_size, 2) + self.per_device_eval_batch_size = min(args.per_device_eval_batch_size, 2) 
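# Editorial aside, not part of the patch: the `dist_env_1_gpu` mapping defined in
# `setUp` above is meant to be consumed with `transformers.testing_utils.mockenv_context`,
# which temporarily exports the torch.distributed-style variables (MASTER_ADDR,
# MASTER_PORT, RANK, LOCAL_RANK, WORLD_SIZE) so a single process can emulate a
# one-GPU launcher, e.g.:
#
#     with mockenv_context(**self.dist_env_1_gpu):
#         trainer.train()
#
# `get_master_port(real_launcher=False)` offsets the base port (DS_TEST_PORT or
# 10999) by one, so these emulated runs do not collide with tests that spawn a
# real launcher.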
self.max_seq_length = 64 - self.max_train_samples = 50 - self.max_valid_samples = 20 - self.max_test_samples = 10 + self.max_train_samples = 10 + self.max_valid_samples = 5 + self.max_test_samples = 5 self.warmup_steps = 10 self.weight_decay = 0.01 @parameterized.expand( - _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True) # Skip test for OOM bug - + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, both_inf_backend=True), - skip_on_empty=True, - ) - def test_trainer_fp32(self, test_name, model_name, feature, data_metric_config, inference_with_ort): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - ) - - trainer, test_dataset = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - trainer.save_model() - trainer.evaluate(inference_with_ort=inference_with_ort) - trainer.predict(test_dataset, inference_with_ort=inference_with_ort) - gc.collect() - - @parameterized.expand( - _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True) # Skip test for OOM bug - + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, both_inf_backend=True), + _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) + + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), skip_on_empty=True, ) - def test_trainer_fp32_with_label_smoothing( - self, test_name, model_name, feature, data_metric_config, inference_with_ort - ): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - label_smoothing_factor=0.1, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - ) - - trainer, test_dataset = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - trainer.save_model() - trainer.evaluate(inference_with_ort=inference_with_ort) - trainer.predict(test_dataset, inference_with_ort=inference_with_ort) - gc.collect() + def test_trainer_fp32(self, test_name, model_name, task, data_metric_config): + with mockenv_context(**self.dist_env_1_gpu): + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = get_ort_training_args( + task=task, + output_dir=tmp_dir, + num_train_epochs=self.n_epochs, + 
per_device_train_batch_size=self.per_device_train_batch_size, + per_device_eval_batch_size=self.per_device_eval_batch_size, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + logging_dir=tmp_dir, + ) + + trainer, test_dataset = get_ort_trainer( + model_name, + task, + data_metric_config, + training_args, + max_seq_length=self.max_seq_length, + max_train_samples=self.max_train_samples, + max_valid_samples=self.max_valid_samples, + max_test_samples=self.max_test_samples, + ) + + trainer.train() + trainer.save_model() + trainer.evaluate() + trainer.predict(test_dataset) + gc.collect() @slow @parameterized.expand( _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) # Skip test for OOM bug + + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), skip_on_empty=True, ) - def test_trainer_fp16_pt_inference(self, test_name, model_name, feature, data_metric_config): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - fp16=True, - ) - - trainer, test_dataset = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - trainer.save_model() - trainer.evaluate() - trainer.predict(test_dataset) - gc.collect() + def test_trainer_fp32_with_label_smoothing(self, test_name, model_name, task, data_metric_config): + with mockenv_context(**self.dist_env_1_gpu): + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = get_ort_training_args( + task=task, + output_dir=tmp_dir, + num_train_epochs=self.n_epochs, + per_device_train_batch_size=self.per_device_train_batch_size, + per_device_eval_batch_size=self.per_device_eval_batch_size, + label_smoothing_factor=0.1, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + logging_dir=tmp_dir, + ) + + trainer, test_dataset = get_ort_trainer( + model_name, + task, + data_metric_config, + training_args, + max_seq_length=self.max_seq_length, + max_train_samples=self.max_train_samples, + max_valid_samples=self.max_valid_samples, + max_test_samples=self.max_test_samples, + ) + + trainer.train() + trainer.save_model() + trainer.evaluate() + trainer.predict(test_dataset) + gc.collect() @slow @parameterized.expand( _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) - # Exclude "with-past" tests as they fail for ORT inference after the mixed-precision training - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, excluded=["text-generation-with-past"]) # Skip test for OOM bug - + _get_models_to_test( - _SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, excluded=["text2text-generation-with-past"] - ), + + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), skip_on_empty=True, ) - def test_trainer_fp16_ort_inference(self, test_name, model_name, feature, 
data_metric_config): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - fp16=True, - ) - - trainer, test_dataset = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - trainer.save_model() - trainer.evaluate(inference_with_ort=True) - trainer.predict(test_dataset, inference_with_ort=True) - gc.collect() - - # Skip this test as a large amount of ops don't support bf16 yet. - # @unittest.skip("Skip BF16 test.") - # @slow - # @require_torch_bf16_gpu - # @parameterized.expand( - # _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) - # + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), - # skip_on_empty=True, - # ) - # def test_trainer_bf16(self, test_name, model_name, feature, data_metric_config): - # with tempfile.TemporaryDirectory() as tmp_dir: - # training_args = get_ort_training_args( - # feature=feature, - # output_dir=tmp_dir, - # num_train_epochs=self.n_epochs, - # per_device_train_batch_size=self.per_device_train_batch_size, - # per_device_eval_batch_size=self.per_device_eval_batch_size, - # warmup_steps=self.warmup_steps, - # weight_decay=self.weight_decay, - # logging_dir=tmp_dir, - # bf16=True, - # ) - - # trainer, test_dataset = get_ort_trainer( - # model_name, - # feature, - # data_metric_config, - # training_args, - # max_seq_length=self.max_seq_length, - # max_train_samples=self.max_train_samples, - # max_valid_samples=self.max_valid_samples, - # max_test_samples=self.max_test_samples, - # ) - - # trainer.train() - # trainer.save_model() - # trainer.evaluate() - # trainer.predict(test_dataset) - # gc.collect() + def test_trainer_fp16(self, test_name, model_name, task, data_metric_config): + with mockenv_context(**self.dist_env_1_gpu): + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = get_ort_training_args( + task=task, + output_dir=tmp_dir, + num_train_epochs=self.n_epochs, + per_device_train_batch_size=self.per_device_train_batch_size, + per_device_eval_batch_size=self.per_device_eval_batch_size, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + logging_dir=tmp_dir, + fp16=True, + ) + + trainer, test_dataset = get_ort_trainer( + model_name, + task, + data_metric_config, + training_args, + max_seq_length=self.max_seq_length, + max_train_samples=self.max_train_samples, + max_valid_samples=self.max_valid_samples, + max_test_samples=self.max_test_samples, + ) + + trainer.train() + trainer.save_model() + trainer.evaluate() + trainer.predict(test_dataset) + gc.collect() @slow @@ -734,14 +688,22 @@ class ORTTrainerIntegrationDeepSpeedTest(unittest.TestCase): def setUp(self): super().setUp() args = ORTTrainingArguments("..") + master_port = get_master_port(real_launcher=False) + self.dist_env_1_gpu = { + "MASTER_ADDR": "localhost", + "MASTER_PORT": master_port, + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + } self.n_epochs = min(args.num_train_epochs, 1) - 
self.per_device_train_batch_size = args.per_device_train_batch_size - self.per_device_eval_batch_size = args.per_device_eval_batch_size + self.per_device_train_batch_size = min(args.per_device_train_batch_size, 2) + self.per_device_eval_batch_size = min(args.per_device_eval_batch_size, 2) self.max_seq_length = 64 - self.max_train_samples = 30 - self.max_valid_samples = 10 - self.max_test_samples = 10 + self.max_train_samples = 10 + self.max_valid_samples = 5 + self.max_test_samples = 5 self.warmup_steps = 10 self.weight_decay = 0.01 @@ -749,126 +711,80 @@ def setUp(self): @parameterized.expand( random.sample( _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), - 1, + 1, # only test one ), skip_on_empty=True, ) - def test_trainer_fp16_ds_stage1(self, test_name, model_name, feature, data_metric_config): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - fp16=True, - deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_1.json", - ) - - trainer, _ = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - gc.collect() + def test_trainer_fp16_ds_stage1(self, test_name, model_name, task, data_metric_config): + with mockenv_context(**self.dist_env_1_gpu): + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = get_ort_training_args( + task=task, + output_dir=tmp_dir, + num_train_epochs=self.n_epochs, + per_device_train_batch_size=self.per_device_train_batch_size, + per_device_eval_batch_size=self.per_device_eval_batch_size, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + logging_dir=tmp_dir, + fp16=True, + deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_1.json", + ) + + trainer, _ = get_ort_trainer( + model_name, + task, + data_metric_config, + training_args, + max_seq_length=self.max_seq_length, + max_train_samples=self.max_train_samples, + max_valid_samples=self.max_valid_samples, + max_test_samples=self.max_test_samples, + ) + + trainer.train() + gc.collect() @parameterized.expand( random.sample( _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), 1, ), skip_on_empty=True, ) - def test_trainer_fp16_ds_stage2(self, test_name, model_name, feature, data_metric_config): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - 
weight_decay=self.weight_decay, - logging_dir=tmp_dir, - fp16=True, - deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_2.json", - ) - - trainer, _ = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - gc.collect() - - -@slow -@pytest.mark.skip(reason="skip for now, server socket error") -class ORTTrainerIntegrationDDPTest(unittest.TestCase): - def test_trainer_ddp_glue(self): - subprocess.run( - "cp ../examples/onnxruntime/training/text-classification/run_glue.py ./", - shell=True, - ) - - subprocess.run( - f"{sys.executable} -m torch.distributed.launch" - " --nproc_per_node=1" - " run_glue.py" - " --model_name_or_path distilbert-base-uncased" - " --task_name mnli" - " --max_seq_length 128" - " --learning_rate 3e-6" - " --do_train" - " --output_dir /tmp/distilbert" - " --overwrite_output_dir" - " --max_steps 200" - " --logging_steps 20" - " --per_device_train_batch_size 32" - " --fp16 --optim adamw_ort_fused" - " --max_train_samples 500", - shell=True, - check=True, - ) - - -# List supported ORT optimizers to test -optim_test_params = [] -if is_torch_available(): - default_adam_kwargs = { - "betas": (ORTTrainingArguments.adam_beta1, ORTTrainingArguments.adam_beta2), - "eps": ORTTrainingArguments.adam_epsilon, - "lr": ORTTrainingArguments.learning_rate, - } - - optim_test_params = [ - ( - ORTOptimizerNames.ADAMW_ORT_FUSED, - onnxruntime.training.optim.FusedAdam, - default_adam_kwargs, - ), - ] + def test_trainer_fp16_ds_stage2(self, test_name, model_name, task, data_metric_config): + with mockenv_context(**self.dist_env_1_gpu): + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = get_ort_training_args( + task=task, + output_dir=tmp_dir, + num_train_epochs=self.n_epochs, + per_device_train_batch_size=self.per_device_train_batch_size, + per_device_eval_batch_size=self.per_device_eval_batch_size, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + logging_dir=tmp_dir, + fp16=True, + deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_2.json", + ) + + trainer, _ = get_ort_trainer( + model_name, + task, + data_metric_config, + training_args, + max_seq_length=self.max_seq_length, + max_train_samples=self.max_train_samples, + max_valid_samples=self.max_valid_samples, + max_test_samples=self.max_test_samples, + ) + + trainer.train() + gc.collect() @slow @@ -876,21 +792,6 @@ def test_trainer_ddp_glue(self): class ORTTrainerOptimizerChoiceTest(unittest.TestCase): def setUp(self): super().setUp() - args = ORTTrainingArguments("..") - self.n_epochs = min(args.num_train_epochs, 1) - self.per_device_train_batch_size = args.per_device_train_batch_size - self.per_device_eval_batch_size = args.per_device_eval_batch_size - - self.max_seq_length = 64 - self.max_train_samples = 50 - self.max_valid_samples = 20 - self.max_test_samples = 10 - - self.warmup_steps = 10 - self.weight_decay = 0.01 - - self.model_name = "bert-base-cased" - self.feature = "text-classification" def check_optim_and_kwargs(self, optim: OptimizerNames, mandatory_kwargs, expected_cls): args = ORTTrainingArguments(optim=optim, output_dir="None") @@ -903,37 +804,6 @@ def check_optim_and_kwargs(self, optim: OptimizerNames, mandatory_kwargs, expect actual_v = optim_kwargs[p] self.assertTrue(actual_v == v, f"Failed check for {p}. 
Expected {v}, but got {actual_v}.") - @parameterized.expand(optim_test_params, skip_on_empty=True) - def test_optim_supported(self, name: str, expected_cls, mandatory_kwargs): - # exercises all the valid --optim options - self.check_optim_and_kwargs(name, mandatory_kwargs, expected_cls) - - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = ORTTrainingArguments( - optim=name, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - ) - - trainer, _ = get_ort_trainer( - self.model_name, - self.feature, - _ENCODER_TASKS_DATASETS_CONFIGS[self.feature], - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - gc.collect() - def test_ort_fused_adam(self): # Pretend that onnxruntime-training is installed and mock onnxruntime.training.optim.FusedAdam exists. # Trainer.get_optimizer_cls_and_kwargs does not use FusedAdam. It only has to return the @@ -951,56 +821,3 @@ def test_ort_fused_adam(self): default_adam_kwargs, mock.optimizers.FusedAdam, ) - - -class ORTSeq2SeqTrainerSpecificIntegrationTest(unittest.TestCase): - def setUp(self): - super().setUp() - args = ORTTrainingArguments("..") - self.n_epochs = min(args.num_train_epochs, 1) - self.per_device_train_batch_size = args.per_device_train_batch_size - self.per_device_eval_batch_size = args.per_device_eval_batch_size - - self.max_seq_length = 32 - self.max_train_samples = 10 - self.max_valid_samples = 10 - self.max_test_samples = 10 - - self.warmup_steps = 10 - self.weight_decay = 0.01 - - @parameterized.expand( - _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), - skip_on_empty=True, - ) - def test_predict_with_generate_ort(self, test_name, model_name, feature, data_metric_config): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - evaluation_strategy="epoch", - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - label_smoothing_factor=0.1, - predict_with_generate=True, - ) - - trainer, test_dataset = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - trainer.evaluate(inference_with_ort=True) - trainer.predict(test_dataset, inference_with_ort=True) - gc.collect()
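
For orientation, here is a minimal sketch of the updated ORTTrainer API that the reworked tests above exercise. The checkpoint, dataset, and column names are illustrative placeholders rather than values taken from this patch; note that the deprecated `feature=` constructor argument and the `inference_with_ort=` flag on `evaluate`/`predict` no longer appear, matching the calls in `nightly_test_trainer.py`. Running it requires `onnxruntime-training` and a CUDA device.

# Minimal usage sketch (assumed setup, not part of the patch).
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments

model_name = "distilbert-base-uncased"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Small, tokenized text-classification dataset for illustration.
dataset = load_dataset("glue", "sst2")
encoded = dataset.map(
    lambda ex: tokenizer(ex["sentence"], truncation=True, max_length=64),
    batched=True,
)

training_args = ORTTrainingArguments(
    output_dir="/tmp/ort_sketch",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    fp16=True,
    optim="adamw_ort_fused",  # ORT fused AdamW, as used by the example tests above
)

trainer = ORTTrainer(  # no `feature=` argument in the updated API
    model=model,
    args=training_args,
    train_dataset=encoded["train"].select(range(64)),
    eval_dataset=encoded["validation"].select(range(16)),
    tokenizer=tokenizer,
)

trainer.train()
trainer.evaluate()  # evaluation/prediction run with PyTorch; `inference_with_ort=` is deprecated by this patch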