diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118
index 668e5e5669..3e6841453b 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118
@@ -22,6 +22,7 @@ CMD nvidia-smi
 ENV DEBIAN_FRONTEND noninteractive
 
 # Versions
+# available options 3.8, 3.9, 3.10, 3.11
 ARG PYTHON_VERSION=3.9
 ARG TORCH_CUDA_VERSION=cu118
 ARG TORCH_VERSION=2.0.0
@@ -34,7 +35,7 @@ SHELL ["/bin/bash", "-c"]
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 b/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116
index db2219b5c6..15df7c352f 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116
@@ -33,7 +33,7 @@ ARG TORCHVISION_VERSION=0.14.1
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118
index 51c9ec514c..2d1306e1a3 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118
@@ -34,7 +34,7 @@ ARG TORCHVISION_VERSION=0.15.1
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118
index 3f6b833592..482d495fcb 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118
@@ -34,7 +34,7 @@ SHELL ["/bin/bash", "-c"]
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get
clean RUN unattended-upgrade RUN apt-get autoremove -y diff --git a/examples/onnxruntime/training/image-classification/run_image_classification.py b/examples/onnxruntime/training/image-classification/run_image_classification.py index 837cb57a4b..ec8de0b52d 100644 --- a/examples/onnxruntime/training/image-classification/run_image_classification.py +++ b/examples/onnxruntime/training/image-classification/run_image_classification.py @@ -16,6 +16,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -54,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") @@ -141,12 +142,28 @@ class ModelArguments: metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -162,32 +179,24 @@ def collate_fn(examples): return {"pixel_values": pixel_values, "labels": labels} -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
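For reference, the argument parsing kept above follows the standard `HfArgumentParser` pattern; a minimal, runnable sketch, with a single hypothetical `ExampleArguments` dataclass standing in for the three argument classes the script actually uses:

import os
import sys
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class ExampleArguments:
    # Hypothetical stand-in for ModelArguments / DataTrainingArguments / ORTTrainingArguments.
    model_name_or_path: Optional[str] = field(default=None, metadata={"help": "Checkpoint or Hub model id."})
    token: Optional[str] = field(default=None, metadata={"help": "HTTP bearer token for remote files."})


def parse_example_args():
    parser = HfArgumentParser(ExampleArguments)
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # A single positional .json argument is treated as a config file instead of CLI flags.
        (args,) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        (args,) = parser.parse_args_into_dataclasses()
    return args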
- model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -200,6 +209,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) transformers.utils.logging.set_verbosity(log_level) @@ -209,7 +222,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -238,7 +251,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, task="image-classification", - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -285,7 +298,8 @@ def compute_metrics(p): finetuning_task="image-classification", cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForImageClassification.from_pretrained( model_args.model_name_or_path, @@ -293,14 +307,16 @@ def compute_metrics(p): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) image_processor = AutoImageProcessor.from_pretrained( model_args.image_processor_name or model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # Define torchvision transforms to be applied to each image. 
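The `token` and `trust_remote_code` keyword arguments threaded through the loading calls above are part of the current `from_pretrained` API; a hedged sketch with a placeholder checkpoint:

from transformers import AutoConfig, AutoImageProcessor, AutoModelForImageClassification

checkpoint = "google/vit-base-patch16-224-in21k"  # placeholder; any image-classification checkpoint works
config = AutoConfig.from_pretrained(
    checkpoint,
    finetuning_task="image-classification",
    token=None,               # None falls back to the token cached by `huggingface-cli login`
    trust_remote_code=False,  # only set to True for repositories whose code you have reviewed
)
model = AutoModelForImageClassification.from_pretrained(
    checkpoint, config=config, token=None, trust_remote_code=False
)
image_processor = AutoImageProcessor.from_pretrained(checkpoint, token=None, trust_remote_code=False)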
@@ -367,7 +383,6 @@ def val_transforms(example_batch): compute_metrics=compute_metrics, tokenizer=image_processor, data_collator=collate_fn, - feature="image-classification", ) # Training @@ -385,7 +400,7 @@ def val_transforms(example_batch): # Evaluation if training_args.do_eval: - metrics = trainer.evaluate(inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate() trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) diff --git a/examples/onnxruntime/training/language-modeling/run_clm.py b/examples/onnxruntime/training/language-modeling/run_clm.py index 2807d3f721..bd9694ae41 100644 --- a/examples/onnxruntime/training/language-modeling/run_clm.py +++ b/examples/onnxruntime/training/language-modeling/run_clm.py @@ -24,12 +24,14 @@ import math import os import sys +import warnings from dataclasses import dataclass, field from itertools import chain from typing import Optional import datasets import evaluate +import torch import transformers from datasets import load_dataset from transformers import ( @@ -52,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -73,7 +75,7 @@ class ModelArguments: default=None, metadata={ "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." + "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." ) }, ) @@ -108,12 +110,47 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." + ) + }, + ) + torch_dtype: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " + "dtype will be automatically derived from the model's weights." + ), + "choices": ["auto", "bfloat16", "float16", "float32"], + }, + ) + low_cpu_mem_usage: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " + "set True will benefit LLM loading time and RAM consumption." 
) }, ) @@ -160,7 +197,7 @@ class DataTrainingArguments: ) }, ) - + streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) block_size: Optional[int] = field( default=None, metadata={ @@ -189,6 +226,9 @@ class DataTrainingArguments: ) def __post_init__(self): + if self.streaming: + require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") + if self.dataset_name is None and self.train_file is None and self.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") else: @@ -200,32 +240,24 @@ def __post_init__(self): assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -238,6 +270,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
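The deprecation handling added above boils down to a small precedence rule (warn on `use_auth_token`, reject conflicting values, otherwise fall through); a standalone sketch of the same logic:

import warnings


def resolve_token(token, use_auth_token):
    # Mirrors the precedence these scripts apply to the deprecated flag.
    if use_auth_token is not None:
        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        return use_auth_token
    return token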
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -248,7 +284,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -285,7 +321,8 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -293,14 +330,16 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) else: data_files = {} @@ -321,7 +360,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. @@ -331,7 +370,7 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) raw_datasets["train"] = load_dataset( @@ -339,7 +378,7 @@ def main(): data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) @@ -355,7 +394,8 @@ def main(): config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) @@ -373,7 +413,8 @@ def main(): "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) @@ -381,32 +422,44 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You are instantiating a new tokenizer from scratch. 
This is not supported by this script. " "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: + torch_dtype = ( + model_args.torch_dtype + if model_args.torch_dtype in ["auto", None] + else getattr(torch, model_args.torch_dtype) + ) model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, + torch_dtype=torch_dtype, + low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: - model = AutoModelForCausalLM.from_config(config) + model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: - column_names = raw_datasets["train"].column_names + column_names = list(raw_datasets["train"].features) else: - column_names = raw_datasets["validation"].column_names + column_names = list(raw_datasets["validation"].features) text_column_name = "text" if "text" in column_names else column_names[0] # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function @@ -424,27 +477,34 @@ def tokenize_function(examples): return output with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + ) if data_args.block_size is None: block_size = tokenizer.model_max_length - if block_size > 1024: + if block_size > config.max_position_embeddings: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." + f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx." 
) - block_size = 1024 + block_size = min(1024, config.max_position_embeddings) else: if data_args.block_size > tokenizer.model_max_length: logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model " f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(data_args.block_size, tokenizer.model_max_length) @@ -454,10 +514,9 @@ def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] @@ -471,16 +530,22 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) + if not data_args.streaming: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + else: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + ) if training_args.do_train: if "train" not in tokenized_datasets: @@ -528,7 +593,6 @@ def compute_metrics(eval_preds): preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available() else None, - feature="text-generation", ) # Training @@ -556,7 +620,7 @@ def compute_metrics(eval_preds): if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) diff --git a/examples/onnxruntime/training/language-modeling/run_mlm.py b/examples/onnxruntime/training/language-modeling/run_mlm.py index 122395a1cd..3365ca8703 100755 --- a/examples/onnxruntime/training/language-modeling/run_mlm.py +++ b/examples/onnxruntime/training/language-modeling/run_mlm.py @@ -25,6 +25,7 @@ import math import os import sys +import warnings from dataclasses import dataclass, field from itertools import chain from typing import Optional @@ -52,7 +53,7 @@ # Will error if the minimal 
version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -71,7 +72,7 @@ class ModelArguments: default=None, metadata={ "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." + "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." ) }, ) @@ -106,12 +107,37 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." + ) + }, + ) + low_cpu_mem_usage: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " + "set True will benefit LLM loading time and RAM consumption." ) }, ) @@ -196,8 +222,12 @@ class DataTrainingArguments: ) }, ) + streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) def __post_init__(self): + if self.streaming: + require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") + if self.dataset_name is None and self.train_file is None and self.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") else: @@ -211,32 +241,24 @@ def __post_init__(self): raise ValueError("`validation_file` should be a csv, a json or a txt file.") -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
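The new `torch_dtype` option (added to run_clm.py above) and `low_cpu_mem_usage` (also exposed here) resolve to regular `from_pretrained` keyword arguments; a sketch with a placeholder checkpoint:

import torch
from transformers import AutoModelForCausalLM


def load_causal_lm(model_name_or_path, torch_dtype=None, low_cpu_mem_usage=False, token=None, trust_remote_code=False):
    # "auto" and None pass through unchanged; otherwise the string is resolved to a torch dtype object.
    resolved_dtype = torch_dtype if torch_dtype in ["auto", None] else getattr(torch, torch_dtype)
    return AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=resolved_dtype,
        low_cpu_mem_usage=low_cpu_mem_usage,  # materialize weights only when loaded, reducing peak RAM
        token=token,
        trust_remote_code=trust_remote_code,
    )


# e.g. load_causal_lm("gpt2", torch_dtype="float16")  # "gpt2" is a placeholder checkpoint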
- model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -249,6 +271,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -259,7 +285,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): logger.info(f"Training/evaluation parameters {training_args}") @@ -297,7 +323,8 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -305,14 +332,16 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + streaming=data_args.streaming, ) else: data_files = {} @@ -328,7 +357,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
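A sketch of the `streaming` flag wired in above; with `streaming=True`, `load_dataset` returns iterable datasets, which is why the streaming branches of `.map()` in these scripts drop `num_proc` and `load_from_cache_file`:

from datasets import load_dataset


def load_corpus(dataset_name, config_name=None, validation_split_percentage=5, streaming=False, token=None):
    # Dataset name/config are placeholders, e.g. ("wikitext", "wikitext-2-raw-v1").
    raw_datasets = load_dataset(dataset_name, config_name, token=token, streaming=streaming)
    if "validation" not in raw_datasets.keys():
        # Carve a validation split out of the head of the training split, as the scripts above do.
        raw_datasets["validation"] = load_dataset(
            dataset_name, config_name, split=f"train[:{validation_split_percentage}%]", token=token, streaming=streaming
        )
        raw_datasets["train"] = load_dataset(
            dataset_name, config_name, split=f"train[{validation_split_percentage}%:]", token=token, streaming=streaming
        )
    return raw_datasets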
@@ -338,14 +367,14 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) raw_datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at @@ -359,7 +388,8 @@ def main(): config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) @@ -377,7 +407,8 @@ def main(): "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) @@ -385,7 +416,7 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You are instantiating a new tokenizer from scratch. This is not supported by this script. " "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) @@ -396,34 +427,41 @@ def main(): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, + low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: logger.info("Training new model from scratch") - model = AutoModelForMaskedLM.from_config(config) + model = AutoModelForMaskedLM.from_config(config, trust_remote_code=model_args.trust_remote_code) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: - column_names = raw_datasets["train"].column_names + column_names = list(raw_datasets["train"].features) else: - column_names = raw_datasets["validation"].column_names + column_names = list(raw_datasets["validation"].features) text_column_name = "text" if "text" in column_names else column_names[0] if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." 
+ "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." ) max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) @@ -448,14 +486,21 @@ def tokenize_function(examples): ) with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset line_by_line", - ) + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=[text_column_name], + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset line_by_line", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=[text_column_name], + ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more @@ -464,14 +509,21 @@ def tokenize_function(examples): return tokenizer(examples[text_column_name], return_special_tokens_mask=True) with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on every text in dataset", - ) + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on every text in dataset", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of # max_seq_length. @@ -479,10 +531,9 @@ def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= max_seq_length: - total_length = (total_length // max_seq_length) * max_seq_length + # We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. 
+ total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] @@ -495,16 +546,22 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {max_seq_length}", - ) + if not data_args.streaming: + tokenized_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {max_seq_length}", + ) + else: + tokenized_datasets = tokenized_datasets.map( + group_texts, + batched=True, + ) if training_args.do_train: if "train" not in tokenized_datasets: @@ -563,7 +620,6 @@ def compute_metrics(eval_preds): preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available() else None, - feature="fill-mask", ) # Training @@ -590,7 +646,7 @@ def compute_metrics(eval_preds): if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) diff --git a/examples/onnxruntime/training/question-answering/run_qa.py b/examples/onnxruntime/training/question-answering/run_qa.py index ec93d45c57..08b581a1a8 100644 --- a/examples/onnxruntime/training/question-answering/run_qa.py +++ b/examples/onnxruntime/training/question-answering/run_qa.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -49,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -79,12 +80,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. 
This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -214,32 +231,24 @@ def __post_init__(self): assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -252,6 +261,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -262,7 +275,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -299,13 +312,14 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] + if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] @@ -317,7 +331,7 @@ def main(): data_files=data_files, field="data", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -331,14 +345,16 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, @@ -346,7 +362,8 @@ def main(): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # Tokenizer check: this script requires a fast tokenizer. @@ -374,7 +391,7 @@ def main(): if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) @@ -599,12 +616,12 @@ def post_processing_function(examples, features, predictions, stage="eval"): # Format the result to the format the metric expects. 
if data_args.version_2_with_negative: formatted_predictions = [ - {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + {"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() ] else: - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()] - references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + references = [{"id": str(ex["id"]), "answers": ex[answer_column_name]} for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad") @@ -623,7 +640,6 @@ def compute_metrics(p: EvalPrediction): data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, - feature="question-answering", ) # Training @@ -649,7 +665,7 @@ def compute_metrics(p: EvalPrediction): # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) @@ -660,10 +676,7 @@ def compute_metrics(p: EvalPrediction): # Prediction if training_args.do_predict: logger.info("*** Predict ***") - results = trainer.predict( - predict_dataset, predict_examples, inference_with_ort=inference_args.inference_with_ort - ) - + results = trainer.predict(predict_dataset, predict_examples) metrics = results.metrics max_predict_samples = ( diff --git a/examples/onnxruntime/training/question-answering/trainer_qa.py b/examples/onnxruntime/training/question-answering/trainer_qa.py index 695ca92927..26ea820ace 100644 --- a/examples/onnxruntime/training/question-answering/trainer_qa.py +++ b/examples/onnxruntime/training/question-answering/trainer_qa.py @@ -15,7 +15,10 @@ """ A subclass of `ORTTrainer` specific to Question-Answering tasks """ -from transformers.trainer_utils import PredictionOutput +import math +import time + +from transformers.trainer_utils import PredictionOutput, speed_metrics from optimum.onnxruntime import ORTTrainer @@ -26,14 +29,7 @@ def __init__(self, *args, eval_examples=None, post_process_function=None, **kwar self.eval_examples = eval_examples self.post_process_function = post_process_function - def evaluate( - self, - eval_dataset=None, - eval_examples=None, - ignore_keys=None, - metric_key_prefix: str = "eval", - inference_with_ort: bool = False, - ): + def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset eval_dataloader = self.get_eval_dataloader(eval_dataset) eval_examples = self.eval_examples if eval_examples is None else eval_examples @@ -41,11 +37,8 @@ def evaluate( # Temporarily disable metric computation, we will do it in the loop here. 
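The `str(...)` casts above exist because the `squad`/`squad_v2` metrics match predictions to references by string `id`. A toy sketch of the expected payloads:

import evaluate

version_2_with_negative = False
metric = evaluate.load("squad_v2" if version_2_with_negative else "squad")

predictions = {42: "Paris"}  # ids may come back from the dataset as ints
formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()]
# For squad_v2, each prediction additionally needs a "no_answer_probability" field.
references = [{"id": "42", "answers": {"text": ["Paris"], "answer_start": [17]}}]

print(metric.compute(predictions=formatted_predictions, references=references))
# -> {'exact_match': 100.0, 'f1': 100.0}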
compute_metrics = self.compute_metrics self.compute_metrics = None - if inference_with_ort: - eval_loop = self.prediction_loop_ort if self.args.use_legacy_prediction_loop else self.evaluation_loop_ort - else: - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + start_time = time.time() try: output = eval_loop( eval_dataloader, @@ -54,11 +47,23 @@ def evaluate( # self.args.prediction_loss_only prediction_loss_only=True if compute_metrics is None else None, ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, ) finally: self.compute_metrics = compute_metrics - - if self.post_process_function is not None and self.compute_metrics is not None: + total_batch_size = self.args.eval_batch_size * self.args.world_size + if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) + if self.post_process_function is not None and self.compute_metrics is not None and self.args.should_save: + # Only the main node write the results by default eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) metrics = self.compute_metrics(eval_preds) @@ -66,31 +71,25 @@ def evaluate( for key in list(metrics.keys()): if not key.startswith(f"{metric_key_prefix}_"): metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + metrics.update(output.metrics) + else: + metrics = output.metrics + if self.args.should_log: + # Only the main node log the results by default self.log(metrics) - else: - metrics = {} self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) return metrics - def predict( - self, - predict_dataset, - predict_examples, - ignore_keys=None, - metric_key_prefix: str = "test", - inference_with_ort: bool = False, - ): + def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): predict_dataloader = self.get_test_dataloader(predict_dataset) # Temporarily disable metric computation, we will do it in the loop here. 
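The timing added to `evaluate` above uses `transformers.trainer_utils.speed_metrics`, which turns a wall-clock duration plus sample/step counts into `*_runtime`, `*_samples_per_second`, and `*_steps_per_second` entries; a sketch of the same computation outside the trainer:

import math
import time

from transformers.trainer_utils import speed_metrics


def timed_eval_metrics(run_eval_loop, num_samples, eval_batch_size, world_size=1, metric_key_prefix="eval"):
    # `run_eval_loop` is a placeholder callable standing in for the trainer's evaluation loop.
    start_time = time.time()
    run_eval_loop()
    total_batch_size = eval_batch_size * world_size
    return speed_metrics(
        metric_key_prefix,
        start_time,
        num_samples=num_samples,
        num_steps=math.ceil(num_samples / total_batch_size),
    )


# timed_eval_metrics(lambda: time.sleep(0.1), num_samples=100, eval_batch_size=8)
# -> {'eval_runtime': ..., 'eval_samples_per_second': ..., 'eval_steps_per_second': ...}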
compute_metrics = self.compute_metrics self.compute_metrics = None - if inference_with_ort: - eval_loop = self.prediction_loop_ort if self.args.use_legacy_prediction_loop else self.evaluation_loop_ort - else: - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + start_time = time.time() try: output = eval_loop( predict_dataloader, @@ -99,9 +98,21 @@ def predict( # self.args.prediction_loss_only prediction_loss_only=True if compute_metrics is None else None, ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, ) finally: self.compute_metrics = compute_metrics + total_batch_size = self.args.eval_batch_size * self.args.world_size + if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) if self.post_process_function is None or self.compute_metrics is None: return output @@ -113,5 +124,5 @@ def predict( for key in list(metrics.keys()): if not key.startswith(f"{metric_key_prefix}_"): metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - + metrics.update(output.metrics) return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) diff --git a/examples/onnxruntime/training/summarization/run_summarization.py b/examples/onnxruntime/training/summarization/run_summarization.py index d1264489d8..83ec61f225 100644 --- a/examples/onnxruntime/training/summarization/run_summarization.py +++ b/examples/onnxruntime/training/summarization/run_summarization.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -51,7 +52,7 @@ # Might have error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") @@ -98,12 +99,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -187,7 +204,7 @@ class DataTrainingArguments: metadata={ "help": ( "The maximum total sequence length for validation target text after tokenization. 
Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. " "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " "during ``evaluate`` and ``predict``." ) @@ -246,14 +263,14 @@ class DataTrainingArguments: }, ) source_prefix: Optional[str] = field( - default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."} + default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."} ) forced_bos_token: Optional[str] = field( default=None, metadata={ "help": ( - "The token to force as the first generated token after the decoder_start_token_id." + "The token to force as the first generated token after the decoder_start_token_id. " "Useful for multilingual models like mBART where the first generated token" "needs to be the target language token (Usually it is the target language token)" ) @@ -261,8 +278,13 @@ class DataTrainingArguments: ) def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + and self.test_file is None + ): + raise ValueError("Need either a dataset name or a training, validation, or test file.") else: if self.train_file is not None: extension = self.train_file.split(".")[-1] @@ -270,22 +292,13 @@ def __post_init__(self): if self.validation_file is not None: extension = self.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." if self.val_max_target_length is None: self.val_max_target_length = self.max_target_length -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - summarization_name_mapping = { "amazon_reviews_multi": ("review_body", "review_title"), "big_patent": ("description", "abstract"), @@ -307,15 +320,19 @@ def main(): # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTSeq2SeqTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTSeq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
- model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -327,6 +344,11 @@ def main(): datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) + + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -337,7 +359,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -386,7 +408,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -403,7 +425,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
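
The authentication change above repeats in every script touched by this patch (run_glue.py, run_ner.py and run_translation.py below): the deprecated `use_auth_token` flag is still accepted but is folded into the new `token` field before any call that reaches the Hub, and passing both is an error. A standalone sketch of those precedence rules; `resolve_token` is an illustrative helper, not code from the patch:

    import warnings


    def resolve_token(token=None, use_auth_token=None):
        # Mirrors the backward-compatibility block added to the scripts' main() functions.
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning
            )
            if token is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            token = use_auth_token
        return token


    resolve_token(use_auth_token=True)   # warns, then the value is forwarded as `token=...`
    resolve_token(token="hf_xxx")        # passes through unchanged ("hf_xxx" is a placeholder)
    # resolve_token(token="hf_xxx", use_auth_token=True) would raise a ValueError
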
@@ -417,14 +439,16 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, @@ -432,11 +456,10 @@ def main(): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - model.resize_token_embeddings(len(tokenizer)) - if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.lang] @@ -471,10 +494,16 @@ def main(): # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") column_names = raw_datasets["train"].column_names elif training_args.do_eval: + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") column_names = raw_datasets["validation"].column_names elif training_args.do_predict: + if "test" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") column_names = raw_datasets["test"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") @@ -520,7 +549,7 @@ def main(): if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " f"`{model.__class__.__name__}`. 
This will lead to loss being calculated twice and will take up more memory" ) @@ -550,8 +579,6 @@ def preprocess_function(examples): return model_inputs if training_args.do_train: - if "train" not in raw_datasets: - raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) @@ -568,8 +595,6 @@ def preprocess_function(examples): if training_args.do_eval: max_target_length = data_args.val_max_target_length - if "validation" not in raw_datasets: - raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) @@ -586,8 +611,6 @@ def preprocess_function(examples): if training_args.do_predict: max_target_length = data_args.val_max_target_length - if "test" not in raw_datasets: - raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) @@ -628,10 +651,10 @@ def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] + # Replace -100s used for padding as we can't decode them + preds = np.where(preds != -100, preds, tokenizer.pad_token_id) decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - if data_args.ignore_pad_token_for_loss: - # Replace -100 in the labels as we can't decode them. - labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing @@ -643,6 +666,16 @@ def compute_metrics(eval_preds): result["gen_len"] = np.mean(prediction_lens) return result + # Override the decoding parameters of Seq2SeqTrainer + training_args.generation_max_length = ( + training_args.generation_max_length + if training_args.generation_max_length is not None + else data_args.val_max_target_length + ) + training_args.generation_num_beams = ( + data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams + ) + # Initialize our Trainer trainer = ORTSeq2SeqTrainer( model=model, @@ -652,7 +685,6 @@ def compute_metrics(eval_preds): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, - feature="text2text-generation", ) # Training @@ -677,20 +709,15 @@ def compute_metrics(eval_preds): # Evaluation results = {} - max_length = ( - training_args.generation_max_length - if training_args.generation_max_length is not None - else data_args.val_max_target_length - ) - num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate( - max_length=max_length, - num_beams=num_beams, - metric_key_prefix="eval", - inference_with_ort=inference_args.inference_with_ort, - ) + if isinstance(eval_dataset, dict): + metrics = {} + for eval_ds_name, eval_ds in eval_dataset.items(): + dataset_metrics = trainer.evaluate(eval_dataset=eval_ds, metric_key_prefix=f"eval_{eval_ds_name}") + metrics.update(dataset_metrics) + else: + metrics = trainer.evaluate(metric_key_prefix="eval") max_eval_samples = 
data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) @@ -700,13 +727,7 @@ def compute_metrics(eval_preds): if training_args.do_predict: logger.info("*** Predict ***") - predict_results = trainer.predict( - predict_dataset, - metric_key_prefix="predict", - max_length=max_length, - num_beams=num_beams, - inference_with_ort=inference_args.inference_with_ort, - ) + predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict") metrics = predict_results.metrics max_predict_samples = ( data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) @@ -718,8 +739,10 @@ def compute_metrics(eval_preds): if trainer.is_world_process_zero(): if training_args.predict_with_generate: + predictions = predict_results.predictions + predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id) predictions = tokenizer.batch_decode( - predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True + predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True ) predictions = [pred.strip() for pred in predictions] output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt") diff --git a/examples/onnxruntime/training/test_examples.py b/examples/onnxruntime/training/test_examples.py deleted file mode 100644 index 8fe1de53d5..0000000000 --- a/examples/onnxruntime/training/test_examples.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import logging -import os -import sys -import unittest -from unittest.mock import patch - -import torch -from transformers.file_utils import is_apex_available -from transformers.testing_utils import TestCasePlus, get_gpu_count, slow, torch_device - - -SRC_DIRS = [ - os.path.join(os.path.dirname(__file__), dirname) - for dirname in [ - "text-classification", - "token-classification", - "question-answering", - "translation", - ] -] -sys.path.extend(SRC_DIRS) -if SRC_DIRS is not None: - import run_glue - import run_ner - import run_qa - import run_translation - - -logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger() - - -def get_results(output_dir): - results = {} - path = os.path.join(output_dir, "all_results.json") - if os.path.exists(path): - with open(path, "r") as f: - results = json.load(f) - else: - raise ValueError(f"can't find {path}") - return results - - -def is_cuda_and_apex_available(): - is_using_cuda = torch.cuda.is_available() and torch_device == "cuda" - return is_using_cuda and is_apex_available() - - -class ExamplesTests(TestCasePlus): - # Text Classification Tests - def test_run_glue(self): - stream_handler = logging.StreamHandler(sys.stdout) - logger.addHandler(stream_handler) - - tmp_dir = self.get_auto_remove_tmp_dir() - testargs = f""" - run_glue.py - --model_name_or_path bert-base-uncased - --task_name sst2 - --do_train - --do_eval - --output_dir {tmp_dir} - --overwrite_output_dir - --learning_rate=1e-5 - --per_device_train_batch_size=16 - --per_device_eval_batch_size=16 - """.split() - - with patch.object(sys, "argv", testargs): - run_glue.main() - result = get_results(tmp_dir) - self.assertGreaterEqual(result["eval_accuracy"], 0.75) - - # Token Classification Tests - def test_run_ner(self): - stream_handler = logging.StreamHandler(sys.stdout) - logger.addHandler(stream_handler) - - # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu - epochs = 7 if get_gpu_count() > 1 else 2 - - tmp_dir = self.get_auto_remove_tmp_dir() - testargs = f""" - run_ner.py - --model_name_or_path bert-base-uncased - --dataset_name conll2003 - --do_train - --do_eval - --output_dir {tmp_dir} - --overwrite_output_dir - --learning_rate=1e-5 - --per_device_train_batch_size=16 - --per_device_eval_batch_size=16 - --num_train_epochs={epochs} - """.split() - - with patch.object(sys, "argv", testargs): - run_ner.main() - result = get_results(tmp_dir) - self.assertGreaterEqual(result["eval_accuracy"], 0.75) - self.assertLess(result["eval_loss"], 0.5) - - # Question Answering Tests - def test_run_qa(self): - stream_handler = logging.StreamHandler(sys.stdout) - logger.addHandler(stream_handler) - - tmp_dir = self.get_auto_remove_tmp_dir() - testargs = f""" - run_qa.py - --model_name_or_path bert-base-uncased - --dataset_name squad - --do_train - --do_eval - --output_dir {tmp_dir} - --overwrite_output_dir - --learning_rate=1e-5 - --per_device_train_batch_size=16 - --per_device_eval_batch_size=16 - """.split() - - with patch.object(sys, "argv", testargs): - run_qa.main() - result = get_results(tmp_dir) - self.assertGreaterEqual(result["eval_f1"], 30) - self.assertGreaterEqual(result["eval_exact"], 30) - - @slow - def test_run_translation(self): - stream_handler = logging.StreamHandler(sys.stdout) - logger.addHandler(stream_handler) - - tmp_dir = self.get_auto_remove_tmp_dir() - testargs = f""" - run_translation.py - --model_name_or_path t5-large - --source_lang en - --target_lang ro - --dataset_name wmt16 - --output_dir 
{tmp_dir} - --overwrite_output_dir - --max_steps=50 - --warmup_steps=8 - --do_train - --learning_rate=3e-3 - --per_device_train_batch_size=2 - --per_device_eval_batch_size=1 - --predict_with_generate - """.split() - - with patch.object(sys, "argv", testargs): - run_translation.main() - result = get_results(tmp_dir) - self.assertGreaterEqual(result["eval_bleu"], 30) - - -if __name__ == "__main__": - unittest.main() diff --git a/examples/onnxruntime/training/text-classification/run_glue.py b/examples/onnxruntime/training/text-classification/run_glue.py index 7a81a2ff15..f3f04657af 100644 --- a/examples/onnxruntime/training/text-classification/run_glue.py +++ b/examples/onnxruntime/training/text-classification/run_glue.py @@ -21,6 +21,7 @@ import os import random import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -48,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -188,12 +189,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -203,32 +220,24 @@ class ModelArguments: ) -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
- model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -241,6 +250,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -291,7 +304,7 @@ def main(): "glue", data_args.task_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) elif data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. @@ -299,7 +312,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: # Loading a dataset from your local files. @@ -328,7 +341,7 @@ def main(): "csv", data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: # Loading a dataset from local json files @@ -336,7 +349,7 @@ def main(): "json", data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. 
@@ -371,14 +384,16 @@ def main(): finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, @@ -386,7 +401,8 @@ def main(): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) @@ -440,7 +456,7 @@ def main(): if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) @@ -496,6 +512,8 @@ def preprocess_function(examples): # Get the metric function if data_args.task_name is not None: metric = evaluate.load("glue", data_args.task_name) + elif is_regression: + metric = evaluate.load("mse") else: metric = evaluate.load("accuracy") @@ -504,17 +522,12 @@ def preprocess_function(examples): def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) - if data_args.task_name is not None: - result = metric.compute(predictions=preds, references=p.label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - elif is_regression: - return {"mse": ((preds - p.label_ids) ** 2).mean().item()} - else: - return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result - # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to ORTTrainer, so we change it if + # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if # we already did the padding. 
if data_args.pad_to_max_length: data_collator = default_data_collator @@ -532,7 +545,6 @@ def compute_metrics(p: EvalPrediction): compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, - feature="text-classification", ) # Training @@ -550,6 +562,7 @@ def compute_metrics(p: EvalPrediction): metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.save_model() # Saves the tokenizer too for easy upload + trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() @@ -571,7 +584,7 @@ def compute_metrics(p: EvalPrediction): combined = {} for eval_dataset, task in zip(eval_datasets, tasks): - metrics = trainer.evaluate(eval_dataset=eval_dataset, inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate(eval_dataset=eval_dataset) max_eval_samples = ( data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) @@ -599,9 +612,7 @@ def compute_metrics(p: EvalPrediction): for predict_dataset, task in zip(predict_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. predict_dataset = predict_dataset.remove_columns("label") - predictions = trainer.predict( - predict_dataset, metric_key_prefix="predict", inference_with_ort=inference_args.inference_with_ort - ).predictions + predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") diff --git a/examples/onnxruntime/training/token-classification/run_ner.py b/examples/onnxruntime/training/token-classification/run_ner.py index 80366f07ad..55ddfa2cf0 100644 --- a/examples/onnxruntime/training/token-classification/run_ner.py +++ b/examples/onnxruntime/training/token-classification/run_ner.py @@ -22,6 +22,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -49,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") @@ -79,12 +80,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." 
) }, ) @@ -204,32 +221,24 @@ def __post_init__(self): self.task_name = self.task_name.lower() -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -242,6 +251,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -252,7 +265,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -289,7 +302,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -358,7 +371,8 @@ def get_label_list(labels): finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path @@ -368,7 +382,8 @@ def get_label_list(labels): cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, add_prefix_space=True, ) else: @@ -377,7 +392,8 @@ def get_label_list(labels): cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForTokenClassification.from_pretrained( @@ -386,7 +402,8 @@ def get_label_list(labels): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) @@ -567,7 +584,6 @@ def compute_metrics(p): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, - feature="token-classification", ) # Training @@ -580,6 +596,7 @@ def compute_metrics(p): train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload + max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) @@ -593,7 +610,7 @@ def compute_metrics(p): if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(inference_with_ort=inference_args.inference_with_ort) + metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) @@ -605,10 +622,7 @@ def compute_metrics(p): if training_args.do_predict: logger.info("*** Predict ***") - predictions, labels, metrics = trainer.predict( - predict_dataset, metric_key_prefix="predict", inference_with_ort=inference_args.inference_with_ort - ) - + predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict") predictions = np.argmax(predictions, 
axis=2) # Remove ignored index (special tokens) diff --git a/examples/onnxruntime/training/translation/run_translation.py b/examples/onnxruntime/training/translation/run_translation.py index e410454f2f..0b6a36d12f 100644 --- a/examples/onnxruntime/training/translation/run_translation.py +++ b/examples/onnxruntime/training/translation/run_translation.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -52,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.23.0") +check_min_version("4.34.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") @@ -89,12 +90,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -156,7 +173,7 @@ class DataTrainingArguments: metadata={ "help": ( "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. " "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " "during ``evaluate`` and ``predict``." ) @@ -248,32 +265,24 @@ def __post_init__(self): self.val_max_target_length = self.max_target_length -@dataclass -class InferenceArguments: - """ - Arguments for inference(evaluate, predict). - """ - - inference_with_ort: bool = field( - default=False, - metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."}, - ) - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTSeq2SeqTrainingArguments, InferenceArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTSeq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
- model_args, data_args, training_args, inference_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if model_args.use_auth_token is not None: + warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. @@ -286,6 +295,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -296,7 +309,7 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -345,7 +358,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -362,7 +375,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
@@ -376,14 +389,16 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, @@ -391,11 +406,10 @@ def main(): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - model.resize_token_embeddings(len(tokenizer)) - # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): if isinstance(tokenizer, MBartTokenizer): @@ -448,7 +462,7 @@ def main(): if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) @@ -549,10 +563,10 @@ def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] + # Replace -100s used for padding as we can't decode them + preds = np.where(preds != -100, preds, tokenizer.pad_token_id) decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - if data_args.ignore_pad_token_for_loss: - # Replace -100 in the labels as we can't decode them. 
- labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing @@ -575,7 +589,6 @@ def compute_metrics(eval_preds): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, - feature="text2text-generation", ) # Training @@ -609,12 +622,7 @@ def compute_metrics(eval_preds): if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate( - max_length=max_length, - num_beams=num_beams, - metric_key_prefix="eval", - inference_with_ort=inference_args.inference_with_ort, - ) + metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) @@ -625,11 +633,7 @@ def compute_metrics(eval_preds): logger.info("*** Predict ***") predict_results = trainer.predict( - predict_dataset, - metric_key_prefix="predict", - max_length=max_length, - num_beams=num_beams, - inference_with_ort=inference_args.inference_with_ort, + predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams ) metrics = predict_results.metrics max_predict_samples = ( @@ -642,8 +646,10 @@ def compute_metrics(eval_preds): if trainer.is_world_process_zero(): if training_args.predict_with_generate: + predictions = predict_results.predictions + predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id) predictions = tokenizer.batch_decode( - predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True + predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True ) predictions = [pred.strip() for pred in predictions] output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt") diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 89363b6b26..afc90e405b 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -22,8 +22,7 @@ import time import types import warnings -from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union # Integrations must be imported before ML frameworks: @@ -49,43 +48,27 @@ # isort: on -import numpy as np +import huggingface_hub.utils as hf_hub_utils import torch import torch.distributed as dist from torch import nn -from torch.utils.data import DataLoader, Dataset +from torch.utils.data import Dataset, RandomSampler from transformers.data.data_collator import DataCollator from transformers.debug_utils import DebugOption, DebugUnderflowOverflow from transformers.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_zero3_enabled -from transformers.file_utils import ( - is_apex_available, - is_sagemaker_dp_enabled, - is_sagemaker_mp_enabled, - is_torch_tpu_available, -) from transformers.modeling_utils import PreTrainedModel, unwrap_model from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import Trainer from transformers.trainer_callback import TrainerCallback, TrainerState from transformers.trainer_pt_utils import ( - DistributedTensorGatherer, - 
IterableDatasetShard, - SequentialDistributedSampler, - find_batch_size, get_model_param_count, get_module_class_from_name, get_parameter_names, - nested_concat, - nested_detach, - nested_numpify, ) from transformers.trainer_utils import ( - EvalLoopOutput, EvalPrediction, HPSearchBackend, - PredictionOutput, TrainOutput, - denumpify_detensorize, enable_full_determinism, find_executable_batch_size, get_last_checkpoint, @@ -94,31 +77,17 @@ speed_metrics, ) from transformers.training_args import ParallelMode +from transformers.utils import ( + is_apex_available, + is_sagemaker_dp_enabled, + is_sagemaker_mp_enabled, + is_torch_tpu_available, +) -from ..exporters import TasksManager -from ..exporters.onnx import OnnxConfigWithPast, export, export_models, get_decoder_models_for_export from ..utils import logging -from .modeling_decoder import ORTModelForCausalLM -from .modeling_ort import ( - ORTModel, - ORTModelForCustomTasks, - ORTModelForFeatureExtraction, - ORTModelForImageClassification, - ORTModelForMaskedLM, - ORTModelForMultipleChoice, - ORTModelForQuestionAnswering, - ORTModelForSemanticSegmentation, - ORTModelForSequenceClassification, - ORTModelForTokenClassification, -) -from .modeling_seq2seq import ORTModelForSeq2SeqLM, ORTModelForSpeechSeq2Seq from .training_args import ORTOptimizerNames, ORTTrainingArguments from .utils import ( - ONNX_DECODER_NAME, - ONNX_DECODER_WITH_PAST_NAME, - ONNX_WEIGHTS_NAME, is_onnxruntime_training_available, - wrap_onnx_config_for_loss, ) @@ -132,14 +101,10 @@ import optuna -logger = logging.get_logger(__name__) - # Name of the files used for checkpointing -TRAINING_ARGS_NAME = "training_args.bin" TRAINER_STATE_NAME = "trainer_state.json" -OPTIMIZER_NAME = "optimizer.pt" -SCHEDULER_NAME = "scheduler.pt" -SCALER_NAME = "scaler.pt" + +logger = logging.get_logger(__name__) class ModuleWithLoss(nn.Module): @@ -168,44 +133,6 @@ def config(self): return self._original_model.config -class ORTFeaturesManager: - _TASKS_TO_ORTMODELS = { - "feature-extraction": ORTModelForFeatureExtraction, - "fill-mask": ORTModelForMaskedLM, - "text-generation": ORTModelForCausalLM, - "text-generation-with-past": ORTModelForCausalLM, - "text2text-generation": ORTModelForSeq2SeqLM, - "text2text-generation-with-past": ORTModelForSeq2SeqLM, - "text-classification": ORTModelForSequenceClassification, - "token-classification": ORTModelForTokenClassification, - "multiple-choice": ORTModelForMultipleChoice, - "question-answering": ORTModelForQuestionAnswering, - "image-classification": ORTModelForImageClassification, - "semantic-segmentation": ORTModelForSemanticSegmentation, - "automatic-speech-recognition": ORTModelForSpeechSeq2Seq, - } - - SUPPORTED_FEATURES = _TASKS_TO_ORTMODELS.keys() - - @staticmethod - def get_model_class_for_feature(feature: str) -> Type: - """ - Gets the subclass of `ORTModel` associated with the feature. - """ - - return ORTFeaturesManager._TASKS_TO_ORTMODELS[feature] - - @staticmethod - def do_use_cache(feature: str) -> bool: - """ - Gets the value of `use_cache` for the feature. - """ - if "-with-past" in feature: - return True - else: - return False - - class ORTTrainer(Trainer): """ ORTTrainer is a simple but feature-complete training and eval loop for ONNX Runtime, optimized for 🤗 Transformers. 
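
With the export machinery and `ORTFeaturesManager` removed, the `ORTTrainer` constructor in the next hunk lines up with `transformers.Trainer` and drops the ORT-inference arguments, which is why the example scripts above no longer pass `feature=...`. A minimal usage sketch, assuming the model, datasets, tokenizer, collator and metric function are prepared earlier as in those scripts:

    from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments

    training_args = ORTTrainingArguments(output_dir="out", do_train=True, do_eval=True)

    trainer = ORTTrainer(
        model=model,                      # a PreTrainedModel built earlier in the script
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # no `feature=...` or `onnx_model_path=...` anymore
    )

    trainer.train()
    metrics = trainer.evaluate()          # runs the stock PyTorch evaluation loop
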
@@ -286,18 +213,16 @@ class ORTTrainer(Trainer): def __init__( self, model: Union[PreTrainedModel, nn.Module] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - feature: str = "feature-extraction", args: ORTTrainingArguments = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, - onnx_model_path: Union[str, os.PathLike] = None, ): super().__init__( model=model, @@ -323,9 +248,6 @@ def __init__( self.model = model - self.feature = feature - self.onnx_model_path = onnx_model_path - self.exported_with_loss = False if self.args.local_rank: torch.cuda.set_device(self.args.local_rank) @@ -437,7 +359,12 @@ def train( if resume_from_checkpoint is None: raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})") - if resume_from_checkpoint is not None and not is_sagemaker_mp_enabled() and args.deepspeed is None: + if ( + resume_from_checkpoint is not None + and not is_sagemaker_mp_enabled() + and not self.is_deepspeed_enabled + and not self.is_fsdp_enabled + ): self._load_from_checkpoint(resume_from_checkpoint) # If model was re-initialized, put it on the right device and update self.model_wrapped @@ -449,12 +376,25 @@ def train( inner_training_loop = find_executable_batch_size( self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size ) - return inner_training_loop( - args=args, - resume_from_checkpoint=resume_from_checkpoint, - trial=trial, - ignore_keys_for_eval=ignore_keys_for_eval, - ) + if args.push_to_hub: + try: + # Disable progress bars when uploading models during checkpoints to avoid polluting stdout + hf_hub_utils.disable_progress_bars() + return inner_training_loop( + args=args, + resume_from_checkpoint=resume_from_checkpoint, + trial=trial, + ignore_keys_for_eval=ignore_keys_for_eval, + ) + finally: + hf_hub_utils.enable_progress_bars() + else: + return inner_training_loop( + args=args, + resume_from_checkpoint=resume_from_checkpoint, + trial=trial, + ignore_keys_for_eval=ignore_keys_for_eval, + ) def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None @@ -504,14 +444,6 @@ def _inner_training_loop( f" {args.max_steps}" ) - # Compute absolute values for logging, eval, and save if given as ratio - if args.logging_steps and args.logging_steps < 1: - args.logging_steps = math.ceil(max_steps * args.logging_steps) - if args.eval_steps and args.eval_steps < 1: - args.eval_steps = math.ceil(max_steps * args.eval_steps) - if args.save_steps and args.save_steps < 1: - args.save_steps = math.ceil(max_steps * args.save_steps) - if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: if self.args.n_gpu > 1: # nn.DataParallel(model) replicates the model, creating new variables and module @@ -556,13 +488,30 @@ def _inner_training_loop( self.state = TrainerState() self.state.is_hyper_param_search = trial is not None + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps 
< 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + # Activate gradient checkpointing if needed if args.gradient_checkpointing: self.model.gradient_checkpointing_enable() model = self._wrap_model(self.model_wrapped) # Wrap unless the ORTModule is already wrapped, eg. wrap DDP - if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: + if (is_sagemaker_mp_enabled() or self.is_fsdp_enabled) and resume_from_checkpoint is not None: self._load_from_checkpoint(resume_from_checkpoint, model) # as the model is wrapped, don't use `accelerator.prepare` @@ -688,11 +637,27 @@ def _inner_training_loop( self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + # Temp: remove after transformers 4.34 release + def get_dataloader_sampler(dataloader): + if hasattr(dataloader, "batch_sampler") and dataloader.batch_sampler is not None: + return get_dataloader_sampler(dataloader.batch_sampler) + elif hasattr(dataloader, "sampler"): + return dataloader.sampler + # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not args.ignore_data_skip: for epoch in range(epochs_trained): - for _ in train_dataloader: - break + sampler = get_dataloader_sampler(train_dataloader) + is_random_sampler = isinstance(sampler, RandomSampler) + if not is_random_sampler: + # We just need to begin an iteration to create the randomization of the sampler. + for _ in train_dataloader: + break + else: + # Otherwise we need to call the whooooole sampler cause there is some random operation added + # AT THE VERY END! + sampler = sampler if sampler is not None else [] + _ = list(sampler) total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): @@ -703,7 +668,7 @@ def _inner_training_loop( self._past = None steps_in_epoch = ( - len(train_dataloader) + len(epoch_iterator) if len_dataloader is not None else args.max_steps * args.gradient_accumulation_steps ) @@ -715,13 +680,13 @@ def _inner_training_loop( rng_to_sync = False steps_skipped = 0 if steps_trained_in_current_epoch > 0: - skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) steps_skipped = steps_trained_in_current_epoch steps_trained_in_current_epoch = 0 rng_to_sync = True step = -1 - for step, inputs in enumerate(train_dataloader): + for step, inputs in enumerate(epoch_iterator): total_batched_samples += 1 if rng_to_sync: self._load_rng_state(resume_from_checkpoint) @@ -864,759 +829,16 @@ def _inner_training_loop( # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. 
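
The resume logic above replaces the old iterate-once-and-break skip with a full pass over each skipped epoch's sampler: with a `RandomSampler`, producing an epoch's shuffle draws from the shared RNG state, so skipping an epoch without consuming that draw would give later epochs a different order than the original run. A toy illustration of the idea (not the trainer's exact code path):

    import torch
    from torch.utils.data import RandomSampler, TensorDataset

    dataset = TensorDataset(torch.arange(10))

    torch.manual_seed(0)
    sampler = RandomSampler(dataset)
    epoch0 = list(sampler)                # shuffle used in epoch 0
    epoch1 = list(sampler)                # shuffle used in epoch 1

    # Resuming from epoch 1: replay epoch 0's draw so the RNG lines up again.
    torch.manual_seed(0)
    resumed = RandomSampler(dataset)
    _ = list(resumed)                     # corresponds to `_ = list(sampler)` in the patch
    assert list(resumed) == epoch1
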
if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: for checkpoint in checkpoints_sorted: - if checkpoint != self.state.best_model_checkpoint: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") shutil.rmtree(checkpoint) self.control = self.callback_handler.on_train_end(args, self.state, self.control) - return TrainOutput(self.state.global_step, train_loss, metrics) - - def evaluate( - self, - eval_dataset: Optional[Dataset] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - inference_with_ort: bool = False, - ) -> Dict[str, float]: - """ - Run evaluation with ONNX Runtime or PyTorch backend and returns metrics. - - Args: - eval_dataset (`Dataset`, *optional*): - Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns - not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` - method. - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is "eval" (default) - - Returns: - A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The - dictionary also contains the epoch number which comes from the training state. - """ - # memory metrics - must set up as early as possible - # TODO: We need to enable evaluation using ORT backend. - if self.args.use_module_with_loss: - self.model = self.model._original_model - self._memory_tracker.start() - - eval_dataloader = self.get_eval_dataloader(eval_dataset) - start_time = time.time() - - if inference_with_ort: - logger.info("[INFO] Evaluating with ONNX Runtime backend.") - eval_loop = self.prediction_loop_ort if self.args.use_legacy_prediction_loop else self.evaluation_loop_ort - else: - logger.info( - "[INFO] Evaluating with PyTorch backend. If you want to use ONNX Runtime for the evaluation, set `trainer.evaluate(inference_with_ort=True)`." - ) - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - - try: - output = eval_loop( - eval_dataloader, - description="Evaluation", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=True if self.compute_metrics is None else None, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - ) - except Exception as error: - logger.error(error) - if inference_with_ort: - logger.error( - f"[ERROR!] Evaluation with ONNX Runtime is not available for {self.model.config.name_or_path} model. Set `inference_with_ort=False` to evaluate with PyTorch." 
- ) - raise - - total_batch_size = self.args.eval_batch_size * self.args.world_size - if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: - start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] - output.metrics.update( - speed_metrics( - metric_key_prefix, - start_time, - num_samples=output.num_samples, - num_steps=math.ceil(output.num_samples / total_batch_size), - ) - ) - - self.log(output.metrics) - - self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) - - self._memory_tracker.stop_and_update_metrics(output.metrics) - - return output.metrics - - def predict( - self, - test_dataset: Dataset, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "test", - inference_with_ort: bool = False, - ) -> PredictionOutput: - """ - Run prediction and returns predictions and potential metrics. - - Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method - will also return metrics, like in `evaluate()`. - - Args: - test_dataset (`Dataset`): - Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the - `model.forward()` method are automatically removed. Has to implement the method `__len__` - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"test"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "test_bleu" if the prefix is "test" (default) - - - - If your predictions or labels have different sequence length (for instance because you're doing dynamic padding - in a token classification task) the predictions will be padded (on the right) to allow for concatenation into - one array. The padding index is -100. - - - - Returns: *NamedTuple* A namedtuple with the following keys: - - - predictions (`np.ndarray`): The predictions on `test_dataset`. - - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). - - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained - labels). - """ - # TODO: We need to enable evaluation using ORT backend. - if self.args.use_module_with_loss: - self.model = self.model._original_model - - # memory metrics - must set up as early as possible - self._memory_tracker.start() - - test_dataloader = self.get_test_dataloader(test_dataset) - start_time = time.time() - - if inference_with_ort: - logger.info("[INFO] Predicting with ONNX Runtime backend.") - eval_loop = self.prediction_loop_ort if self.args.use_legacy_prediction_loop else self.evaluation_loop_ort - else: - logger.info( - "[INFO] Predicting with PyTorch backend. If you want to use ONNX Runtime for the prediction, set `trainer.predict(inference_with_ort=True)`." - ) - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - - try: - output = eval_loop( - test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix - ) - except Exception as error: - logger.error(error) - if inference_with_ort: - logger.error( - f"[ERROR!] Prediction with ONNX Runtime is not available for {self.model.config.name_or_path} model. Set `inference_with_ort=False` to predict with PyTorch." 
- ) - raise - - total_batch_size = self.args.eval_batch_size * self.args.world_size - if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: - start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] - output.metrics.update( - speed_metrics( - metric_key_prefix, - start_time, - num_samples=output.num_samples, - num_steps=math.ceil(output.num_samples / total_batch_size), - ) - ) - - self._memory_tracker.stop_and_update_metrics(output.metrics) - - return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics) - - def evaluation_loop_ort( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> EvalLoopOutput: - """ - Prediction/evaluation loop, shared by `ORTTrainer.evaluate()` and `ORTTrainer.predict()`. - - Works both with or without labels. - """ - logger.info("[INFO] ONNX Runtime inference starts...") - - # Check if there are labels in the dataset - dummy_inputs = next(iter(dataloader)) - has_labels = all(dummy_inputs.get(k) is not None for k in self.label_names) - use_cache = ORTFeaturesManager.do_use_cache(self.feature) - - if self.onnx_model_path and (has_labels == self.exported_with_loss): - logger.info("[INFO] Inference with given ONNX model") - self.onnx_model_path = Path(self.onnx_model_path).as_posix() - else: - onnx_model_path = Path(self.args.output_dir) - - logger.info("[INFO] Exporting the model to ONNX...") - if self.args.deepspeed and self.args.fp16: - export_device = "cuda" - else: - export_device = "cpu" - - # With `label_smoother` the loss will be computed outside modeling - with_loss = has_labels and not self.label_smoother - self._export(onnx_model_path, with_loss=with_loss, device=export_device, use_cache=use_cache) - - self.exported_with_loss = with_loss - self.onnx_model_path = onnx_model_path.as_posix() - logger.info(f"[INFO] ONNX model is stored in: {self.onnx_model_path}") - - # Load ORT model - support_loss_in_modeling = self.feature in [ - "text-generation", - "text-generation-with-past", - "text2text-generation", - "text2text-generation-with-past", - ] - support_feature = self.feature in ORTFeaturesManager.SUPPORTED_FEATURES - if support_loss_in_modeling or (not self.exported_with_loss and support_feature): - # Exported with standard outputs, use specific ORTModels - ort_model_cls = ORTFeaturesManager.get_model_class_for_feature(self.feature) - else: - ort_model_cls = ORTModelForCustomTasks - - model_id = self.onnx_model_path - args = self.args - if ort_model_cls is ORTModelForCausalLM: - ort_model = ort_model_cls.from_pretrained(model_id=model_id, use_cache=use_cache).to(args.device) - else: - ort_model = ort_model_cls.from_pretrained(model_id=model_id).to(args.device) - - prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only - - batch_size = dataloader.batch_size - - logger.info(f"***** Running {description} *****") - if has_length(dataloader): - logger.info(f" Num examples = {self.num_examples(dataloader)}") - else: - logger.info(" Num examples: Unknown") - logger.info(f" Batch size = {batch_size}") - - self.callback_handler.eval_dataloader = dataloader - # Do this before wrapping. 
- eval_dataset = getattr(dataloader, "dataset", None) - - if args.past_index >= 0: - self._past = None - - # Initialize containers - # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) - losses_host = None - preds_host = None - labels_host = None - inputs_host = None - - # losses/preds/labels on CPU (final containers) - all_losses = None - all_preds = None - all_labels = None - all_inputs = None - # Will be useful when we have an iterable dataset so don't know its length. - - observed_num_examples = 0 - # Main evaluation loop - for step, inputs in enumerate(dataloader): - # Update the observed num examples - observed_batch_size = find_batch_size(inputs) - if observed_batch_size is not None: - observed_num_examples += observed_batch_size - # For batch samplers, batch_size is not known by the dataloader in advance. - if batch_size is None: - batch_size = observed_batch_size - - # Prediction step(send also onnxruntime inference session) - loss, logits, labels = self.prediction_step_ort( - ort_model, inputs, prediction_loss_only, ignore_keys=ignore_keys - ) - inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None - - # Update containers on host - if loss is not None: - losses = self.accelerator.gather_for_metrics((loss.repeat(batch_size))) - losses_host = losses if losses_host is None else nested_concat(losses_host, losses, padding_index=-100) - if labels is not None: - labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) - if inputs_decode is not None: - inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100) - inputs_decode = self.accelerator.gather_for_metrics((inputs_decode)) - inputs_host = ( - inputs_decode - if inputs_host is None - else nested_concat(inputs_host, inputs_decode, padding_index=-100) - ) - if logits is not None: - logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100) - if self.preprocess_logits_for_metrics is not None: - logits = self.preprocess_logits_for_metrics(logits, labels) - logits = self.accelerator.gather_for_metrics((logits)) - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) - - if labels is not None: - labels = self.accelerator.gather_for_metrics((labels)) - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. 
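The removed evaluation loop keeps per-step losses/logits/labels on the device and, as the lines that follow show, only moves them to NumPy on the host every few steps. A minimal standalone sketch of that accumulate-then-flush pattern, using `nested_concat`/`nested_numpify` from `transformers.trainer_pt_utils`; the fake logits and the accumulation setting are illustrative only.

import torch
from transformers.trainer_pt_utils import nested_concat, nested_numpify

eval_accumulation_steps = 2                           # hypothetical setting, for illustration
preds_host, all_preds = None, None
for step, logits in enumerate(torch.randn(6, 2, 3)):  # pretend: 6 eval steps of (batch=2, num_labels=3) logits
    preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
    if (step + 1) % eval_accumulation_steps == 0:
        # Flush the accumulated device tensors to NumPy on the host and reset the buffer
        chunk = nested_numpify(preds_host)
        all_preds = chunk if all_preds is None else nested_concat(all_preds, chunk, padding_index=-100)
        preds_host = None

print(all_preds.shape)   # (12, 3): 6 steps x batch size 2, concatenated on the host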
- if args.eval_accumulation_steps is not None and self.accelerator.sync_gradients: - if losses_host is not None: - losses = nested_numpify(losses_host) - all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) - if preds_host is not None: - logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) - if inputs_host is not None: - inputs_decode = nested_numpify(inputs_host) - all_inputs = ( - inputs_decode - if all_inputs is None - else nested_concat(all_inputs, inputs_decode, padding_index=-100) - ) - if labels_host is not None: - labels = nested_numpify(labels_host) - all_labels = ( - labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) - ) - - # Set back to None to begin a new accumulation - losses_host, preds_host, inputs_host, labels_host = None, None, None, None - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - if losses_host is not None: - losses = nested_numpify(losses_host) - all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) - if preds_host is not None: - logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) - if inputs_host is not None: - inputs_decode = nested_numpify(inputs_host) - all_inputs = ( - inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) - ) - if labels_host is not None: - labels = nested_numpify(labels_host) - all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) - - # Number of samples - if has_length(eval_dataset): - num_samples = len(eval_dataset) - # The instance check is weird and does not actually check for the type, but whether the dataset has the right - # methods. Therefore we need to make sure it also has the attribute. - elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0: - num_samples = eval_dataset.num_examples - else: - if has_length(dataloader): - num_samples = self.num_examples(dataloader) - else: # both len(dataloader.dataset) and len(dataloader) fail - num_samples = observed_num_examples - if num_samples == 0 and observed_num_examples > 0: - num_samples = observed_num_examples - - # Metrics! - if self.compute_metrics is not None and all_preds is not None and all_labels is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) + # Wait for the checkpoint to be uploaded. 
+ self._finish_current_push() - if all_losses is not None: - metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() - if hasattr(self, "jit_compilation_time"): - metrics[f"{metric_key_prefix}_jit_compilation_time"] = self.jit_compilation_time - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) - - def prediction_loop_ort( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> EvalLoopOutput: - """ - Prediction/evaluation loop, shared by `ORTTrainer.evaluate()` and `ORTTrainer.predict()`. - - Works both with or without labels. - """ - logger.info("[INFO] ONNX Runtime inference starts...") - - # Check if there are labels in the dataset - dummy_inputs = next(iter(dataloader)) - has_labels = all(dummy_inputs.get(k) is not None for k in self.label_names) - use_cache = ORTFeaturesManager.do_use_cache(self.feature) - - if self.onnx_model_path and (has_labels == self.exported_with_loss): - logger.info("[INFO] Inference with given ONNX model") - self.onnx_model_path = Path(self.onnx_model_path).as_posix() - else: - onnx_model_path = Path(self.args.output_dir) - - logger.info("[INFO] Exporting the model to ONNX...") - if self.args.deepspeed and self.args.fp16: - export_device = "cuda" - else: - export_device = "cpu" - - # With `label_smoother` the loss will be computed outside modeling - with_loss = has_labels and not self.label_smoother - self._export(onnx_model_path, with_loss=with_loss, device=export_device, use_cache=use_cache) - - self.exported_with_loss = with_loss - self.onnx_model_path = onnx_model_path.as_posix() - logger.info("[INFO] ONNX model is stored in:\n", self.onnx_model_path) - - # Load ORT model - support_loss_in_modeling = self.feature in [ - "text-generation", - "text-generation-with-past", - "text2text-generation", - "text2text-generation-with-past", - ] - support_feature = self.feature in ORTFeaturesManager.SUPPORTED_FEATURES - if support_loss_in_modeling or (not self.exported_with_loss and support_feature): - # Exported with standard outputs, use specific ORTModels - ort_model_cls = ORTFeaturesManager.get_model_class_for_feature(self.feature) - else: - ort_model_cls = ORTModelForCustomTasks - - model_id = self.onnx_model_path - args = self.args - if ort_model_cls is ORTModelForCausalLM: - ort_model = ort_model_cls.from_pretrained(model_id=model_id, use_cache=use_cache).to(args.device) - else: - ort_model = ort_model_cls.from_pretrained(model_id=model_id).to(args.device) - - if not has_length(dataloader): - raise ValueError("dataloader must implement a working __len__") - - prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only - - batch_size = dataloader.batch_size - num_examples = self.num_examples(dataloader) - logger.info(f"***** Running {description} *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Batch size = {batch_size}") - losses_host: torch.Tensor = None - preds_host: Union[torch.Tensor, List[torch.Tensor]] = None - labels_host: Union[torch.Tensor, List[torch.Tensor]] = None - inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None - - world_size = max(1, args.world_size) - - 
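With `evaluation_loop_ort`/`prediction_loop_ort` removed, ONNX Runtime inference is no longer triggered from inside the trainer; the standalone `ORTModel` classes cover that use case. A minimal sketch of evaluating a trained checkpoint this way, assuming a sequence-classification model whose tokenizer was saved alongside it (the path is a placeholder):

from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSequenceClassification

model_dir = "output/checkpoint-best"   # hypothetical path to the checkpoint saved by the trainer
# export=True converts the PyTorch checkpoint to ONNX before creating the ONNX Runtime session
ort_model = ORTModelForSequenceClassification.from_pretrained(model_dir, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

classifier = pipeline("text-classification", model=ort_model, tokenizer=tokenizer)
print(classifier("ONNX Runtime inference after training with ORTTrainer"))

If metrics are needed, the predictions from such a pipeline (or from `ort_model(**batch)` on a tokenized batch) can be fed to the same `compute_metrics` function that was passed to the trainer.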
eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) - if not prediction_loss_only: - # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass - # a batch size to the sampler) - make_multiple_of = None - if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): - make_multiple_of = dataloader.sampler.batch_size - preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - - if args.past_index >= 0: - self._past = None - - self.callback_handler.eval_dataloader = dataloader - - for step, inputs in enumerate(dataloader): - loss, logits, labels = self.prediction_step_ort( - ort_model, inputs, prediction_loss_only, ignore_keys=ignore_keys - ) - inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None - - if loss is not None: - losses = loss.repeat(batch_size) - losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) - if logits is not None: - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) - if labels is not None: - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - if inputs_decode is not None: - inputs_host = ( - inputs_decode - if inputs_host is None - else nested_concat(inputs_host, inputs_decode, padding_index=-100) - ) - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. 
- if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) - if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) - inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) - - # Set back to None to begin a new accumulation - losses_host, preds_host, labels_host, inputs_host = None, None, None, None - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) - if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) - inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) - - eval_loss = eval_losses_gatherer.finalize() - preds = preds_gatherer.finalize() if not prediction_loss_only else None - label_ids = labels_gatherer.finalize() if not prediction_loss_only else None - inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None - - if self.compute_metrics is not None and preds is not None and label_ids is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) - - if eval_loss is not None: - metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return EvalLoopOutput(predictions=preds, label_ids=label_ids, metrics=metrics, num_samples=num_examples) - - def prediction_step_ort( - self, - model: ORTModel, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - - Args: - model (`ORTModel`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - - Return: - Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, - logits and labels (each being optional). 
- """ - has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names) - # For CLIP-like models capable of returning loss values. - # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss` - # is `True` in `model.forward`. - return_loss = inputs.get("return_loss", None) - if return_loss is None: - return_loss = self.can_return_loss - loss_without_labels = True if len(self.label_names) == 0 and return_loss else False - - inputs = self._prepare_inputs(inputs) - - if ignore_keys is None: - if hasattr(self.model, "config"): - ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) - else: - ignore_keys = [] - - # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. - if has_labels or loss_without_labels: - labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) - if len(labels) == 1: - labels = labels[0] - else: - labels = None - - with torch.no_grad(): - if is_sagemaker_mp_enabled(): - raise NotImplementedError( - "Sagemaker's distributed data parallel features are not supported by `ORTTrainer` yet." - ) - else: - if has_labels or loss_without_labels: - with self.compute_loss_context_manager(): - loss, outputs = self.compute_loss_ort(model, inputs, return_outputs=True) - loss = loss.mean().detach() - - if isinstance(outputs, dict): - logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"]) - else: - logits = outputs[1:] - else: - loss = None - with self.compute_loss_context_manager(): - outputs = model(**inputs) - if isinstance(outputs, dict): - logits = tuple(v for k, v in outputs.items() if k not in ignore_keys) - else: - logits = outputs - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index - 1] - - if prediction_loss_only: - return (loss, None, None) - - logits = nested_detach(logits) - if len(logits) == 1: - logits = logits[0] - - return (loss, logits, labels) - - def compute_loss_ort(self, model, inputs, return_outputs=False): - """ - How the loss is computed by ORTTrainer. By default, all models return the loss in the first element. - Subclass and override for custom behavior. - """ - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - outputs = model(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - if "text-generation" in self.feature: - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. 
- loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss - - def _export( - self, - model_path: os.PathLike, - model: Optional[PreTrainedModel] = None, - opset: Optional[int] = None, - device: str = "cpu", - with_loss: bool = True, - use_cache: bool = False, - ) -> None: - """ - Load and export a model to an ONNX format. - - Args: - model_path (`os.PathLike`): - The path used to save the model exported to an ONNX format. - model ([`PreTrainedModel`], *optional*): - The model to export. If not provided, a `model_path` must be passed. - opset (`int`, *optional*): - ONNX opset version to export the model with. - device (`str`, *optional*, defaults to `cpu`): - The device on which the ONNX model will be exported. Either `cpu` or `cuda`. - with_loss (`bool`, defaults to `True`): - Whether to export ONNX model with the loss in outputs. - """ - if model is None: - if not (self.args.fp16 and self.args.deepspeed): - # Taking CPU to export the model - self.model.to("cpu") - model = unwrap_model(self.model) - - onnx_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="onnx", task=self.feature - ) - onnx_config = onnx_config_constructor(model.config) - opset = onnx_config.DEFAULT_ONNX_OPSET if opset is None else opset - - is_decoder = isinstance(onnx_config, OnnxConfigWithPast) - - if is_decoder: - output_names = [ONNX_DECODER_NAME] - if use_cache is True: - output_names.append(ONNX_DECODER_WITH_PAST_NAME) - - models_and_onnx_configs = get_decoder_models_for_export(model, onnx_config) - if with_loss is True: - opset = max(opset, 12) - models_and_onnx_configs_with_loss = {} - for decoder_name, (decoder, decoder_config) in models_and_onnx_configs.items(): - models_and_onnx_configs_with_loss[decoder_name] = ( - decoder, - wrap_onnx_config_for_loss(decoder_config), - ) - - export_models( - models_and_onnx_configs=models_and_onnx_configs_with_loss if with_loss else models_and_onnx_configs, - opset=opset, - output_dir=model_path, - output_names=output_names, - device=device, - disable_dynamic_axes_fix=True, # onnxruntime floating point exception (core dumped) - ) - else: - if with_loss is True: - onnx_config = wrap_onnx_config_for_loss(onnx_config) - opset = max(opset, 12) # Operators like `nll_loss`are added for opset>=12 - - output_path = model_path / ONNX_WEIGHTS_NAME - _ = export(model=model, config=onnx_config, opset=opset, output=output_path, device=device) - - model.config.save_pretrained(model_path) + return TrainOutput(self.state.global_step, train_loss, metrics) def _wrap_model(self, model, training=True, dataloader=None): # TODO: ipex only works with inference with PyTorch, will move `inference_with_ort` to training arguments and @@ -1674,18 +896,24 @@ def _wrap_model(self, model, training=True, dataloader=None): auto_wrap_policy = None auto_wrapper_callable = None - if self.args.fsdp_config["fsdp_min_num_params"] > 0: + default_transformer_cls_names_to_wrap = getattr(model, "_no_split_modules", None) + fsdp_transformer_layer_cls_to_wrap = self.args.fsdp_config.get( + "transformer_layer_cls_to_wrap", default_transformer_cls_names_to_wrap + ) + + if self.args.fsdp_config["min_num_params"] > 0: auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, min_num_params=self.args.fsdp_config["fsdp_min_num_params"] + size_based_auto_wrap_policy, min_num_params=self.args.fsdp_config["min_num_params"] ) - elif self.args.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not 
None: + elif fsdp_transformer_layer_cls_to_wrap is not None: transformer_cls_to_wrap = set() - for layer_class in self.args.fsdp_config["fsdp_transformer_layer_cls_to_wrap"]: + for layer_class in fsdp_transformer_layer_cls_to_wrap: transformer_cls = get_module_class_from_name(model, layer_class) if transformer_cls is None: raise Exception("Could not find the transformer layer class to wrap in the model.") else: transformer_cls_to_wrap.add(transformer_cls) + auto_wrap_policy = functools.partial( transformer_auto_wrap_policy, # Transformer layer class to wrap diff --git a/optimum/onnxruntime/trainer_seq2seq.py b/optimum/onnxruntime/trainer_seq2seq.py index a281462fde..2e43ee89e0 100644 --- a/optimum/onnxruntime/trainer_seq2seq.py +++ b/optimum/onnxruntime/trainer_seq2seq.py @@ -14,40 +14,25 @@ """ The ORTSeq2SeqTrainer class, to easily train a sequence to sequence model in 🤗 Transformers from scratch or finetune it on a new task with ONNX Runtime. """ -from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union -import numpy as np import torch from torch import nn -from torch.utils.data import DataLoader, Dataset +from torch.utils.data import Dataset from transformers.deepspeed import is_deepspeed_zero3_enabled -from transformers.modeling_utils import PreTrainedModel, unwrap_model -from transformers.trainer_pt_utils import ( - DistributedTensorGatherer, - IterableDatasetShard, - SequentialDistributedSampler, - find_batch_size, - nested_concat, - nested_numpify, -) -from transformers.trainer_utils import ( - EvalLoopOutput, - EvalPrediction, - PredictionOutput, - denumpify_detensorize, - has_length, -) -from transformers.utils import logging - -from ..exporters import TasksManager -from ..exporters.onnx import export -from .modeling_ort import ORTModel -from .modeling_seq2seq import ORTModelForSeq2SeqLM +from transformers.trainer_utils import PredictionOutput +from transformers.utils import is_accelerate_available, logging + from .trainer import ORTTrainer -from .utils import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ONNX_ENCODER_NAME, wrap_onnx_config_for_loss +if is_accelerate_available(): + pass +else: + raise ImportError( + "The package `accelerate` is required to use the ORTTrainer. Please install it following https://huggingface.co/docs/accelerate/basic_tutorials/install." + ) + logger = logging.get_logger(__name__) @@ -57,15 +42,19 @@ def evaluate( eval_dataset: Optional[Dataset] = None, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval", - inference_with_ort: bool = False, **gen_kwargs, ) -> Dict[str, float]: """ - Run evaluation with ONNX Runtime or PyTorch backend and returns metrics. + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. Args: eval_dataset (`Dataset`, *optional*): - Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns + Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` method. ignore_keys (`List[str]`, *optional*): @@ -73,7 +62,14 @@ def evaluate( gathering predictions. metric_key_prefix (`str`, *optional*, defaults to `"eval"`): An optional prefix to be used as the metrics key prefix. 
For example the metrics "bleu" will be named - "eval_bleu" if the prefix is "eval" (default) + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. Returns: A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The @@ -81,26 +77,26 @@ def evaluate( """ gen_kwargs = gen_kwargs.copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + + # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the + # training args + if ( + gen_kwargs.get("max_length") is None + and gen_kwargs.get("max_new_tokens") is None + and self.args.generation_max_length is not None + ): gen_kwargs["max_length"] = self.args.generation_max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams - ) + if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: + gen_kwargs["num_beams"] = self.args.generation_num_beams self._gen_kwargs = gen_kwargs - return super().evaluate( - eval_dataset, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - inference_with_ort=inference_with_ort, - ) + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) def predict( self, test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - inference_with_ort: bool = False, + metric_key_prefix: str = "test", **gen_kwargs, ) -> "PredictionOutput": """ @@ -111,20 +107,27 @@ def predict( Args: test_dataset (`Dataset`): - Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the + Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. Has to implement the method `__len__` ignore_keys (`List[str]`, *optional*): A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"test"`): + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "test_bleu" if the prefix is "test" (default) + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. - If your predictions or labels have different sequence length (for instance because you're doing dynamic padding - in a token classification task) the predictions will be padded (on the right) to allow for concatenation into - one array. The padding index is -100. + If your predictions or labels have different sequence lengths (for instance because you're doing dynamic + padding in a token classification task) the predictions will be padded (on the right) to allow for + concatenation into one array. 
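The rewritten `evaluate` above only falls back to `generation_max_length`/`generation_num_beams` from the training arguments when the caller did not pass the corresponding generation option. A standalone sketch of that precedence; the `resolve_gen_kwargs` helper and the namespace are illustrative only, not part of the trainer API.

from types import SimpleNamespace

def resolve_gen_kwargs(gen_kwargs, args):
    # Fall back to the legacy training-args values only when the caller did not set them.
    gen_kwargs = dict(gen_kwargs)
    if (
        gen_kwargs.get("max_length") is None
        and gen_kwargs.get("max_new_tokens") is None
        and args.generation_max_length is not None
    ):
        gen_kwargs["max_length"] = args.generation_max_length
    if gen_kwargs.get("num_beams") is None and args.generation_num_beams is not None:
        gen_kwargs["num_beams"] = args.generation_num_beams
    return gen_kwargs

args = SimpleNamespace(generation_max_length=128, generation_num_beams=4)
print(resolve_gen_kwargs({}, args))                       # {'max_length': 128, 'num_beams': 4}
print(resolve_gen_kwargs({"max_new_tokens": 32}, args))   # {'max_new_tokens': 32, 'num_beams': 4}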
The padding index is -100. @@ -137,486 +140,20 @@ def predict( """ gen_kwargs = gen_kwargs.copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.args.generation_max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams - ) - self._gen_kwargs = gen_kwargs - - return super().predict( - test_dataset, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - inference_with_ort=inference_with_ort, - ) - - def evaluation_loop_ort( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> EvalLoopOutput: - """ - Prediction/evaluation loop, shared by `ORTTrainer.evaluate()` and `ORTTrainer.predict()`. - - Works both with or without labels. - """ - logger.info("[INFO] ONNX Runtime inference starts...") - self.ort_model = None - - # Check if there are labels in the dataset - dummy_inputs = next(iter(dataloader)) - has_labels = all(dummy_inputs.get(k) is not None for k in self.label_names) - - # Export ONNX models - if self.onnx_model_path and (has_labels == self.exported_with_loss): - logger.info("[INFO] Inference with given ONNX model") - self.onnx_model_path = Path(self.onnx_model_path).as_posix() - else: - onnx_model_path = Path(self.args.output_dir) - logger.info("[INFO] Exporting the model to ONNX...") - if self.args.deepspeed and self.args.fp16: - export_device = "cuda" - else: - export_device = "cpu" - - with_loss = has_labels and not self.label_smoother - # Only need to export decoders if the models have been exported before. - decoders_only = True if self.onnx_model_path else False - self._export(onnx_model_path, with_loss=with_loss, device=export_device, decoders_only=decoders_only) - - self.exported_with_loss = with_loss - self.onnx_model_path = onnx_model_path.as_posix() - logger.info("[INFO] ONNX model is stored in:\n", self.onnx_model_path) - - args = self.args - # Load ORT model - self.ort_model = ORTModelForSeq2SeqLM.from_pretrained(model_id=self.onnx_model_path).to(args.device) - - prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only - - batch_size = dataloader.batch_size - - logger.info(f"***** Running {description} *****") - if has_length(dataloader): - logger.info(f" Num examples = {self.num_examples(dataloader)}") - else: - logger.info(" Num examples: Unknown") - logger.info(f" Batch size = {batch_size}") - - self.callback_handler.eval_dataloader = dataloader - # Do this before wrapping. - eval_dataset = getattr(dataloader, "dataset", None) - - if args.past_index >= 0: - self._past = None - - # Initialize containers - # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) - losses_host = None - preds_host = None - labels_host = None - inputs_host = None - - # losses/preds/labels on CPU (final containers) - all_losses = None - all_preds = None - all_labels = None - all_inputs = None - # Will be useful when we have an iterable dataset so don't know its length. - - observed_num_examples = 0 - # Main evaluation loop - for step, inputs in enumerate(dataloader): - # Update the observed num examples - observed_batch_size = find_batch_size(inputs) - if observed_batch_size is not None: - observed_num_examples += observed_batch_size - # For batch samplers, batch_size is not known by the dataloader in advance. 
- if batch_size is None: - batch_size = observed_batch_size - - # Prediction step(send also onnxruntime inference session) - loss, logits, labels = self.prediction_step_ort( - self.ort_model, inputs, prediction_loss_only, ignore_keys=ignore_keys - ) - inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None - - # Update containers on host - if loss is not None: - losses = self.accelerator.gather_for_metrics((loss.repeat(batch_size))) - losses_host = losses if losses_host is None else nested_concat(losses_host, losses, padding_index=-100) - if labels is not None: - labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) - if inputs_decode is not None: - inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100) - inputs_decode = self.accelerator.gather_for_metrics((inputs_decode)) - inputs_host = ( - inputs_decode - if inputs_host is None - else nested_concat(inputs_host, inputs_decode, padding_index=-100) - ) - if logits is not None: - logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100) - if self.preprocess_logits_for_metrics is not None: - logits = self.preprocess_logits_for_metrics(logits, labels) - logits = self.accelerator.gather_for_metrics((logits)) - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) - - if labels is not None: - labels = self.accelerator.gather_for_metrics((labels)) - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. - if args.eval_accumulation_steps is not None and self.accelerator.sync_gradients: - if losses_host is not None: - losses = nested_numpify(losses_host) - all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) - if preds_host is not None: - logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) - if inputs_host is not None: - inputs_decode = nested_numpify(inputs_host) - all_inputs = ( - inputs_decode - if all_inputs is None - else nested_concat(all_inputs, inputs_decode, padding_index=-100) - ) - if labels_host is not None: - labels = nested_numpify(labels_host) - all_labels = ( - labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) - ) - - # Set back to None to begin a new accumulation - losses_host, preds_host, inputs_host, labels_host = None, None, None, None - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - if losses_host is not None: - losses = nested_numpify(losses_host) - all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) - if preds_host is not None: - logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) - if inputs_host is not None: - inputs_decode = nested_numpify(inputs_host) - all_inputs = ( - inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) - ) - if labels_host is not None: - labels = nested_numpify(labels_host) - all_labels = labels if all_labels is None else 
nested_concat(all_labels, labels, padding_index=-100) - - # Number of samples - if has_length(eval_dataset): - num_samples = len(eval_dataset) - # The instance check is weird and does not actually check for the type, but whether the dataset has the right - # methods. Therefore we need to make sure it also has the attribute. - elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0: - num_samples = eval_dataset.num_examples - else: - if has_length(dataloader): - num_samples = self.num_examples(dataloader) - else: # both len(dataloader.dataset) and len(dataloader) fail - num_samples = observed_num_examples - if num_samples == 0 and observed_num_examples > 0: - num_samples = observed_num_examples - - # Metrics! - if self.compute_metrics is not None and all_preds is not None and all_labels is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) - - if all_losses is not None: - metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) - - def prediction_loop_ort( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> PredictionOutput: - """ - Prediction/evaluation loop, shared by `ORTTrainer.evaluate()` and `ORTTrainer.predict()`. - - Works both with or without labels. - """ - logger.info("[INFO] ONNX Runtime inference starts...") - self.ort_model = None - - # Check if there are labels in the dataset - dummy_inputs = next(iter(dataloader)) - has_labels = all(dummy_inputs.get(k) is not None for k in self.label_names) - - # Export ONNX models - if self.onnx_model_path and (has_labels == self.exported_with_loss): - logger.info("[INFO] Inference with given ONNX model") - self.onnx_model_path = Path(self.onnx_model_path).as_posix() - else: - onnx_model_path = Path(self.args.output_dir) - logger.info("[INFO] Exporting the model to ONNX...") - if self.args.deepspeed and self.args.fp16: - export_device = "cuda" - else: - export_device = "cpu" - - with_loss = has_labels and not self.label_smoother - # Only need to export decoders if the models have been exported before. 
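The removed seq2seq loops here export the encoder/decoder to ONNX and reload them through `ORTModelForSeq2SeqLM`; after this change the same class can simply be used directly once training is done. A minimal sketch, assuming a summarization-style checkpoint saved with its tokenizer under a placeholder path:

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM

model_dir = "output/seq2seq-checkpoint"   # hypothetical path to the fine-tuned checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_dir)
# export=True runs the ONNX export (encoder + decoder) before creating the ONNX Runtime sessions
ort_model = ORTModelForSeq2SeqLM.from_pretrained(model_dir, export=True)

inputs = tokenizer("summarize: ONNX Runtime speeds up inference.", return_tensors="pt")
generated = ort_model.generate(**inputs, max_new_tokens=32, num_beams=4)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))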
- decoders_only = True if self.onnx_model_path else False - self._export(onnx_model_path, with_loss=with_loss, device=export_device, decoders_only=decoders_only) - - self.exported_with_loss = with_loss - self.onnx_model_path = onnx_model_path.as_posix() - logger.info("[INFO] ONNX model is stored in:\n", self.onnx_model_path) - - args = self.args - # Load ORT model - self.ort_model = ORTModelForSeq2SeqLM.from_pretrained( - model_id=self.onnx_model_path, provider="CUDAExecutionProvider" - ) - - if not has_length(dataloader): - raise ValueError("dataloader must implement a working __len__") - - prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only - - batch_size = dataloader.batch_size - num_examples = self.num_examples(dataloader) - logger.info(f"***** Running {description} *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Batch size = {batch_size}") - losses_host: torch.Tensor = None - preds_host: Union[torch.Tensor, List[torch.Tensor]] = None - labels_host: Union[torch.Tensor, List[torch.Tensor]] = None - inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None - - world_size = max(1, args.world_size) - - eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) - if not prediction_loss_only: - # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass - # a batch size to the sampler) - make_multiple_of = None - if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): - make_multiple_of = dataloader.sampler.batch_size - preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - - if args.past_index >= 0: - self._past = None - - self.callback_handler.eval_dataloader = dataloader - - for step, inputs in enumerate(dataloader): - loss, logits, labels = self.prediction_step_ort( - self.ort_model, inputs, prediction_loss_only, ignore_keys=ignore_keys - ) - inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None - - if loss is not None: - losses = loss.repeat(batch_size) - losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) - if logits is not None: - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) - if labels is not None: - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - if inputs_decode is not None: - inputs_host = ( - inputs_decode - if inputs_host is None - else nested_concat(inputs_host, inputs_decode, padding_index=-100) - ) - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. 
- if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) - if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) - inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) - - # Set back to None to begin a new accumulation - losses_host, preds_host, labels_host, inputs_host = None, None, None, None - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) - if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) - inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) - - eval_loss = eval_losses_gatherer.finalize() - preds = preds_gatherer.finalize() if not prediction_loss_only else None - label_ids = labels_gatherer.finalize() if not prediction_loss_only else None - inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None - - if self.compute_metrics is not None and preds is not None and label_ids is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) - - if eval_loss is not None: - metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) - - def prediction_step_ort( - self, - model: ORTModel, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - **gen_kwargs, - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - - Subclass and override to inject custom behavior. - - Args: - model (`ORTModel`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - ignore_keys (`Lst[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - gen_kwargs: - Additional `generate` specific kwargs. - - Return: - Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss=None, generated - tokens and labels (each being optional). 
- """ - if not self.args.predict_with_generate or prediction_loss_only: - return super().prediction_step_ort( - model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys - ) - - has_labels = "labels" in inputs - inputs = self._prepare_inputs(inputs) - - # Priority (handled in generate): - # gen_kwargs > model.generation_config > default GenerationConfig() - - if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): - gen_kwargs = self._gen_kwargs.copy() - - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.model.config.max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams - ) - default_synced_gpus = True if is_deepspeed_zero3_enabled() else False - gen_kwargs["synced_gpus"] = ( - gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus - ) - - # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate - # (otherwise, it would continue generating from the padded `decoder_input_ids`) + # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the + # training args if ( - "labels" in inputs - and "decoder_input_ids" in inputs - and inputs["labels"].shape == inputs["decoder_input_ids"].shape + gen_kwargs.get("max_length") is None + and gen_kwargs.get("max_new_tokens") is None + and self.args.generation_max_length is not None ): - inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} - generated_tokens = self.model.generate(**inputs, **gen_kwargs) - - # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop - # TODO: remove this hack when the legacy code that initializes generation_config from a model config is - # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 - if self.model.generation_config._from_model_config: - self.model.generation_config._from_model_config = False - # Retrieves GenerationConfig from model.generation_config - gen_config = self.model.generation_config - # in case the batch is shorter than max length, the output should be padded - if generated_tokens.shape[-1] < gen_config.max_length: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) - elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) - - with torch.no_grad(): - with self.compute_loss_context_manager(): - if self.label_smoother is not None: - onnx_inputs = {k: v for k, v in inputs.items() if k != "labels"} - outputs = model(**onnx_inputs) - else: - outputs = model(**inputs) - if has_labels: - if self.label_smoother is not None: - labels = inputs["labels"] - # With label smoother, loss will be calculated out of box - # So the outputs of InferenceSession need to be converted to tensor and sent to the same device - loss = self.label_smoother(outputs, labels.to(outputs.logits.device)).mean().detach() - else: - loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() - else: - loss = None - - if self.args.prediction_loss_only: - return loss, None, None - - if has_labels: - labels = inputs["labels"] - if labels.shape[-1] < 
gen_config.max_length: - labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) - elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: - labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) - else: - labels = None + gen_kwargs["max_length"] = self.args.generation_max_length + if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: + gen_kwargs["num_beams"] = self.args.generation_num_beams + self._gen_kwargs = gen_kwargs - return loss, generated_tokens, labels + return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) def prediction_step( self, @@ -657,31 +194,31 @@ def prediction_step( has_labels = "labels" in inputs inputs = self._prepare_inputs(inputs) + # XXX: adapt synced_gpus for fairscale as well # Priority (handled in generate): - # gen_kwargs > model.generation_config > default GenerationConfig() - + # non-`None` gen_kwargs > model.generation_config > default GenerationConfig() if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): gen_kwargs = self._gen_kwargs.copy() + if "num_beams" in gen_kwargs and gen_kwargs["num_beams"] is None: + gen_kwargs.pop("num_beams") + if "max_length" in gen_kwargs and gen_kwargs["max_length"] is None: + gen_kwargs.pop("max_length") - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.model.config.max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams - ) default_synced_gpus = True if is_deepspeed_zero3_enabled() else False gen_kwargs["synced_gpus"] = ( gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus ) + generation_inputs = inputs.copy() # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate # (otherwise, it would continue generating from the padded `decoder_input_ids`) if ( - "labels" in inputs - and "decoder_input_ids" in inputs - and inputs["labels"].shape == inputs["decoder_input_ids"].shape + "labels" in generation_inputs + and "decoder_input_ids" in generation_inputs + and generation_inputs["labels"].shape == generation_inputs["decoder_input_ids"].shape ): - inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} - generated_tokens = self.model.generate(**inputs, **gen_kwargs) + generation_inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} + generated_tokens = self.model.generate(**generation_inputs, **gen_kwargs) # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop # TODO: remove this hack when the legacy code that initializes generation_config from a model config is @@ -739,85 +276,3 @@ def _pad_tensors_to_max_len(self, tensor, max_length): ) padded_tensor[:, : tensor.shape[-1]] = tensor return padded_tensor - - def _export( - self, - save_dir: Union[str, Path], - model: Optional[PreTrainedModel] = None, - opset: Optional[int] = None, - device: str = "cpu", - with_loss: bool = True, - decoders_only: bool = False, - **kwargs, - ) -> None: - """ - Load and export a sequence-to-sequence model to ONNX models(encoder and decoder(s)). - - Args: - save_dir (`str` or `Path`): - The directory where the ONNX models(encoder, decoder...) should be saved, default to - `transformers.file_utils.default_cache_path`, which is the cache dir for transformers. 
- device (`str`, *optional*, defaults to `cpu`): - The device on which the ONNX model will be exported. Either `cpu` or `cuda`. - with_loss (`bool`, defaults to `True`): - Whether to export ONNX model with the loss in outputs. - decoders_only (`bool`, defaults to `False`): - Whether to just export decoder models. - """ - if model is None: - if not (self.args.fp16 and self.args.deepspeed): - # Taking CPU to export the model - self.model.to("cpu") - model = unwrap_model(self.model) - - onnx_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="onnx", task=self.feature - ) - onnx_config = onnx_config_constructor(model.config) - - opset = onnx_config.DEFAULT_ONNX_OPSET if opset is None else opset - - encoder = model.get_encoder() - - onnx_config_encoder = onnx_config.with_behavior("encoder") - onnx_config_decoder = onnx_config.with_behavior("decoder", use_past=False) - onnx_config_decoder_with_past = onnx_config.with_behavior("decoder", use_past=True) - - if with_loss: - # Add `loss` to the ONNX config of decoders - onnx_config_decoder = wrap_onnx_config_for_loss(onnx_config_decoder) - onnx_config_decoder_with_past = wrap_onnx_config_for_loss(onnx_config_decoder_with_past) - opset = max(opset, 12) # Operators like `nll_loss`are added for opset>=12 - - # Export the encoder - if not decoders_only: - _ = export( - model=encoder, - config=onnx_config_encoder, - opset=opset, - output=Path(save_dir).joinpath(ONNX_ENCODER_NAME), - device=device, - ) - # Export the decoder without the past key values - export( - model=model, - config=onnx_config_decoder, - opset=opset, - output=Path(save_dir).joinpath(ONNX_DECODER_NAME), - device=device, - ) - - # Export the decoder with the past key values - use_cache = kwargs.get("use_cache", True) - if use_cache: - export( - model=model, - config=onnx_config_decoder_with_past, - opset=opset, - output=Path(save_dir).joinpath(ONNX_DECODER_WITH_PAST_NAME), - device=device, - ) - - # TODO: Need to use merged decoder to reduce the use of GPU memory - - model.config.save_pretrained(save_dir) diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 88e5fc9bc2..a0cb7c8e98 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -34,6 +34,7 @@ from transformers.utils import ( ExplicitEnum, get_full_repo_name, + is_accelerate_available, is_safetensors_available, is_torch_available, is_torch_bf16_cpu_available, @@ -41,6 +42,7 @@ is_torch_tf32_available, logging, ) +from transformers.utils.generic import strtobool if is_torch_available(): @@ -137,8 +139,9 @@ def __post_init__(self): if self.load_best_model_at_end: if self.evaluation_strategy != self.save_strategy: raise ValueError( - "--load_best_model_at_end requires the save and eval strategy to match, but found\n- Evaluation " - f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" + "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " + "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps " + f"{self.save_steps} and eval_steps {self.eval_steps}." 
) if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: if self.eval_steps < 1 or self.save_steps < 1: @@ -189,14 +192,15 @@ def __post_init__(self): self.half_precision_backend = self.fp16_backend if self.bf16 or self.bf16_full_eval: - if self.no_cuda and not is_torch_bf16_cpu_available(): + if self.use_cpu and not is_torch_bf16_cpu_available(): # cpu raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). You need torch>=1.10") - elif not self.no_cuda and torch.cuda.is_available() and not is_torch_bf16_gpu_available(): - # gpu - raise ValueError( - "Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0" - ) + elif not self.use_cpu: + if torch.cuda.is_available() and not is_torch_bf16_gpu_available(): + # gpu + raise ValueError( + "Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0" + ) if self.fp16 and self.bf16: raise ValueError("At most one of fp16 and bf16 can be True, but not both") @@ -286,6 +290,7 @@ def __post_init__(self): " otherwise." ) torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True else: logger.warning( "The speedups for torchdynamo mostly come wih GPU Ampere or higher and which is not detected here." @@ -294,13 +299,24 @@ def __post_init__(self): if self.tf32: if is_torch_tf32_available(): torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True else: raise ValueError("--tf32 requires Ampere or a newer GPU arch, cuda>=11 and torch>=1.7") else: if is_torch_tf32_available(): torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False # no need to assert on else + # if training args is specified, it will override the one specified in the accelerate config + if self.half_precision_backend != "apex": + mixed_precision_dtype = os.environ.get("ACCELERATE_MIXED_PRECISION", "no") + if self.fp16: + mixed_precision_dtype = "fp16" + elif self.bf16: + mixed_precision_dtype = "bf16" + os.environ["ACCELERATE_MIXED_PRECISION"] = mixed_precision_dtype + if self.report_to is None: logger.info( "The default value for the training argument `--report_to` will change in v5 (from all installed " @@ -342,44 +358,44 @@ def __post_init__(self): self.fsdp_config = {} if isinstance(self.fsdp_config, str): + if len(self.fsdp) == 0: + warnings.warn("`--fsdp_config` is useful only when `--fsdp` is specified.") with io.open(self.fsdp_config, "r", encoding="utf-8") as f: self.fsdp_config = json.load(f) + for k in list(self.fsdp_config.keys()): + if k.startswith("fsdp_"): + v = self.fsdp_config.pop(k) + self.fsdp_config[k[5:]] = v if self.fsdp_min_num_params > 0: warnings.warn("using `--fsdp_min_num_params` is deprecated. 
Use fsdp_config instead ", FutureWarning) - self.fsdp_config["fsdp_min_num_params"] = max( - self.fsdp_config.get("fsdp_min_num_params", 0), self.fsdp_min_num_params - ) + self.fsdp_config["min_num_params"] = max(self.fsdp_config.get("min_num_params", 0), self.fsdp_min_num_params) - # if fsdp_config["fsdp_transformer_layer_cls_to_wrap"] is specified as a string, convert it to a list with a single object - if isinstance(self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None), str): - self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = [ - self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"] - ] + # if fsdp_config["transformer_layer_cls_to_wrap"] is specified as a string, convert it to a list with a single object + if isinstance(self.fsdp_config.get("transformer_layer_cls_to_wrap", None), str): + self.fsdp_config["transformer_layer_cls_to_wrap"] = [self.fsdp_config["transformer_layer_cls_to_wrap"]] if self.fsdp_transformer_layer_cls_to_wrap is not None: warnings.warn( "using `--fsdp_transformer_layer_cls_to_wrap` is deprecated. Use fsdp_config instead ", FutureWarning ) - self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = self.fsdp_config.get( - "fsdp_transformer_layer_cls_to_wrap", [] + self.fsdp_config["transformer_layer_cls_to_wrap"] = self.fsdp_config.get( + "transformer_layer_cls_to_wrap", [] ) + [self.fsdp_transformer_layer_cls_to_wrap] - if len(self.fsdp) == 0 and self.fsdp_config["fsdp_min_num_params"] > 0: - warnings.warn("`--fsdp_min_num_params` is useful only when `--fsdp` is specified.") + if len(self.fsdp) == 0 and self.fsdp_config["min_num_params"] > 0: + warnings.warn("`min_num_params` is useful only when `--fsdp` is specified.") - if len(self.fsdp) == 0 and self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None: - warnings.warn("`--fsdp_transformer_layer_cls_to_wrap` is useful only when `--fsdp` is specified.") + if len(self.fsdp) == 0 and self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None: + warnings.warn("`transformer_layer_cls_to_wrap` is useful only when `--fsdp` is specified.") if ( len(self.fsdp) > 0 - and self.fsdp_config["fsdp_min_num_params"] > 0 - and self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None + and self.fsdp_config["min_num_params"] > 0 + and self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None ): - raise ValueError( - "`--fsdp_min_num_params` and `--fsdp_transformer_layer_cls_to_wrap` are mutually exclusive." 
- ) + raise ValueError("`min_num_params` and `transformer_layer_cls_to_wrap` are mutually exclusive.") self.fsdp_config["xla"] = self.fsdp_config.get("xla", False) self.fsdp_config["xla_fsdp_grad_ckpt"] = self.fsdp_config.get("xla_fsdp_grad_ckpt", False) if self.fsdp_config["xla"]: @@ -405,23 +421,29 @@ def __post_init__(self): FSDP_SHARDING_STRATEGY, ) + prefix = "FSDP_" for fsdp_option in self.fsdp: if fsdp_option.upper() in FSDP_SHARDING_STRATEGY: # set environment variable for FSDP sharding strategy - os.environ["FSDP_SHARDING_STRATEGY"] = str(FSDP_SHARDING_STRATEGY.index(fsdp_option.upper()) + 1) + os.environ[f"{prefix}SHARDING_STRATEGY"] = str( + FSDP_SHARDING_STRATEGY.index(fsdp_option.upper()) + 1 + ) elif fsdp_option == FSDPOption.OFFLOAD: - os.environ["FSDP_OFFLOAD_PARAMS"] = "true" + os.environ[f"{prefix}OFFLOAD_PARAMS"] = "true" elif fsdp_option == FSDPOption.AUTO_WRAP: - if self.fsdp_config["fsdp_min_num_params"] > 0: - os.environ["FSDP_MIN_NUM_PARAMS"] = str(self.fsdp_config["fsdp_min_num_params"]) - os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[1] - elif self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None: - os.environ["FSDP_TRANSFORMER_CLS_TO_WRAP"] = ",".join( - self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"] + os.environ[f"{prefix}AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[0] + if self.fsdp_config["min_num_params"] > 0: + os.environ[f"{prefix}MIN_NUM_PARAMS"] = str(self.fsdp_config["min_num_params"]) + os.environ[f"{prefix}AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[1] + elif self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None: + os.environ[f"{prefix}TRANSFORMER_CLS_TO_WRAP"] = ",".join( + self.fsdp_config["transformer_layer_cls_to_wrap"] ) - os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[0] prefetch_policy = self.fsdp_config.get("fsdp_backward_prefetch", "NO_PREFETCH") - os.environ["FSDP_BACKWARD_PREFETCH"] = prefetch_policy.upper() + os.environ[f"{prefix}BACKWARD_PREFETCH"] = prefetch_policy.upper() + os.environ[f"{prefix}FORWARD_PREFETCH"] = self.fsdp_config.get("forward_prefect", "false") + os.environ[f"{prefix}SYNC_MODULE_STATES"] = self.fsdp_config.get("sync_module_states", "true") + os.environ[f"{prefix}USE_ORIG_PARAMS"] = self.fsdp_config.get("use_orig_params", "false") if self.tpu_metrics_debug: warnings.warn( @@ -444,7 +466,9 @@ def __post_init__(self): if self.deepspeed: # - must be run very last in arg parsing, since it will use a lot of these settings. # - must be run before the model is created. 
- from transformers.deepspeed import HfTrainerDeepSpeedConfig + if not is_accelerate_available(): + raise ValueError("--deepspeed requires Accelerate to be installed: `pip install accelerate`.") + from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig # will be used later by the Trainer # note: leave self.deepspeed unmodified in case a user relies on it not to be modified) @@ -456,6 +480,14 @@ def __post_init__(self): os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" self.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.hf_deepspeed_config) + elif strtobool(os.environ.get("ACCELERATE_USE_DEEPSPEED", "false")): + # Accelerate DeepSpeed Plugin + from accelerate.utils import DeepSpeedPlugin + + self.deepspeed_plugin = DeepSpeedPlugin() + mixed_precision = os.environ.get("ACCELERATE_MIXED_PRECISION", "no") + self.deepspeed_plugin.set_mixed_precision(mixed_precision) + self.deepspeed_plugin.set_deepspeed_weakref() if self.push_to_hub_token is not None: warnings.warn( diff --git a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer index 62f7efc817..7266ba224a 100644 --- a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer +++ b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer @@ -34,7 +34,7 @@ ARG TORCHVISION_VERSION=0.15.1 # Install and update tools to minimize security vulnerabilities RUN apt-get update RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \ - bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \ + bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \ apt-get clean RUN unattended-upgrade RUN apt-get autoremove -y @@ -65,7 +65,7 @@ RUN $PYTHON_EXE -m pip install onnx ninja RUN $PYTHON_EXE -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} -f https://download.pytorch.org/whl/${TORCH_CUDA_VERSION} # ORT Module -RUN $PYTHON_EXE -m pip install onnxruntime-training==1.15.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html +RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html RUN $PYTHON_EXE -m pip install torch-ort ENV TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX" RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2 @@ -76,4 +76,5 @@ COPY . /workspace/optimum RUN pip install /workspace/optimum[tests] ENV TEST_LEVEL=1 -CMD RUN_SLOW=1 pytest -v -rs onnxruntime/nightly_test_trainer.py --durations=0 \ No newline at end of file +CMD RUN_SLOW=1 pytest -v -rs onnxruntime/training/nightly_test_trainer.py --durations=0 +CMD RUN_SLOW=1 pytest -v -rs onnxruntime/training/nightly_test_examples.py --durations=0 \ No newline at end of file diff --git a/tests/onnxruntime/training/nightly_test_examples.py b/tests/onnxruntime/training/nightly_test_examples.py new file mode 100644 index 0000000000..a16913a097 --- /dev/null +++ b/tests/onnxruntime/training/nightly_test_examples.py @@ -0,0 +1,219 @@ +# coding=utf-8 +# Copyright 2023 the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test ONNX Runtime Training Examples in Optimum.""" + +import subprocess +import unittest + +import pytest +from transformers.testing_utils import slow + + +@slow +class ORTTrainerExampleTest(unittest.TestCase): + def test_text_classification(self): + subprocess.run( + "cp ../examples/onnxruntime/training/text-classification/run_glue.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_glue.py" + " --model_name_or_path distilbert-base-uncased" + " --task_name mnli" + " --max_seq_length 64" + " --learning_rate 3e-6" + " --do_train" + " --output_dir /tmp/distilbert" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 8" + " --fp16 --optim adamw_ort_fused" + " --max_train_samples 20", + shell=True, + check=True, + ) + + def test_token_classification(self): + subprocess.run( + "cp ../examples/onnxruntime/training/token-classification/run_ner.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_ner.py" + " --model_name_or_path bert-base-cased" + " --dataset_name conll2003" + " --do_train" + " --output_dir /tmp/bert" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 8" + " --fp16 --optim adamw_ort_fused" + " --max_train_samples 20", + shell=True, + check=True, + ) + + def test_translation(self): + subprocess.run( + "cp ../examples/onnxruntime/training/translation/run_translation.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_translation.py" + " --model_name_or_path t5-small" + " --dataset_name wmt16" + " --dataset_config ro-en" + " --label_smoothing 0.1" + " --predict_with_generate" + " --source_lang en" + " --target_lang ro" + " --do_train" + " --max_train_samples 30" + " --output_dir /tmp/t5" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 2" + " --fp16 --optim adamw_ort_fused", + shell=True, + check=True, + ) + + @pytest.mark.skip(reason="skip for now") + def test_summarization(self): + subprocess.run( + "cp ../examples/onnxruntime/training/summarization/run_summarization.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_summarization.py" + " --model_name_or_path t5-small" + " --do_train" + " --do_eval" + " --dataset_name cnn_dailymail" + ' --dataset_config "3.0.0"' + ' --source_prefix "summarize: "' + " --predict_with_generate" + " --max_train_samples 30" + " --output_dir /tmp/t5" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 2" + " --per_device_eval_batch_size 2" + " --fp16 --optim adamw_ort_fused", + shell=True, + check=True, + ) + + # TODO: Update the example and add the test + def test_stable_diffusion_txt2img(self): + pass + + @pytest.mark.skip(reason="skip for now") + def test_question_answering(self): + subprocess.run( + "cp ../examples/onnxruntime/training/question-answering/run_qa.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_qa.py" + " 
--model_name_or_path bert-base-uncased" + " --do_train" + " --do_eval" + " --dataset_name squad" + " --max_train_samples 30" + " --output_dir /tmp/bert" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 2" + " --per_device_eval_batch_size 2" + " --fp16 --optim adamw_ort_fused", + shell=True, + check=True, + ) + + @pytest.mark.skip(reason="skip for now") + def test_language_modeling(self): + subprocess.run( + "cp ../examples/onnxruntime/training/question-answering/run_qa.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_clm.py" + " --model_name_or_path gpt2" + " --do_train" + " --do_eval" + " --dataset_name wikitext" + " --dataset_config_name wikitext-2-raw-v1" + " --max_train_samples 30" + " --output_dir /tmp/gpt2" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 2" + " --per_device_eval_batch_size 2" + " --fp16 --optim adamw_ort_fused", + shell=True, + check=True, + ) + + @pytest.mark.skip(reason="skip for now") + def test_image_classification(self): + subprocess.run( + "cp ../examples/onnxruntime/training/image-classification/run_image_classification.py ./", + shell=True, + ) + + subprocess.run( + "torchrun" + " --nproc_per_node=1" + " run_image_classification.py" + " --model_name_or_path google/vit-base-patch16-224-in21k" + " --do_train" + " --do_eval" + " --dataset_name beans" + " --max_train_samples 30" + " --output_dir /tmp/vit" + " --overwrite_output_dir" + " --max_steps 50" + " --logging_steps 50" + " --per_device_train_batch_size 2" + " --per_device_eval_batch_size 2" + " --fp16 --optim adamw_ort_fused", + shell=True, + check=True, + ) diff --git a/tests/onnxruntime/nightly_test_trainer.py b/tests/onnxruntime/training/nightly_test_trainer.py similarity index 54% rename from tests/onnxruntime/nightly_test_trainer.py rename to tests/onnxruntime/training/nightly_test_trainer.py index 2eb3ca433f..e24ee30617 100644 --- a/tests/onnxruntime/nightly_test_trainer.py +++ b/tests/onnxruntime/training/nightly_test_trainer.py @@ -12,11 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+"""Test ONNX Runtime Training ORTTrainer in Optimum.""" import gc +import os import random -import subprocess -import sys import tempfile import unittest from itertools import chain @@ -25,7 +25,6 @@ import nltk import numpy as np -import pytest from datasets import load_dataset from evaluate import load from transformers import ( @@ -35,12 +34,16 @@ AutoModelForTokenClassification, AutoTokenizer, DataCollatorForSeq2Seq, - DataCollatorForTokenClassification, DataCollatorWithPadding, default_data_collator, is_torch_available, ) -from transformers.testing_utils import require_deepspeed, require_torch, slow +from transformers.testing_utils import ( + mockenv_context, + require_deepspeed, + require_torch, + slow, +) from transformers.training_args import OptimizerNames @@ -75,11 +78,11 @@ "data_collator": default_data_collator, "data_collator_class": DataCollatorWithPadding, }, - "token-classification": { - "dataset": ["conll2003"], - "metric": ["seqeval"], - "data_collator_class": DataCollatorForTokenClassification, - }, + # "token-classification": { + # "dataset": ["conll2003"], + # "metric": ["seqeval"], + # "data_collator_class": DataCollatorForTokenClassification, + # }, } _DECODER_TASKS_DATASETS_CONFIGS = { @@ -88,11 +91,6 @@ "metric": ["accuracy"], "data_collator": default_data_collator, }, - "text-generation-with-past": { - "dataset": ["wikitext", "wikitext-2-raw-v1"], - "metric": ["accuracy"], - "data_collator": default_data_collator, - }, } _SEQ2SEQ_TASKS_DATASETS_CONFIGS = { @@ -101,30 +99,37 @@ "metric": ["rouge"], "data_collator_class": DataCollatorForSeq2Seq, }, - "text2text-generation-with-past": { - "dataset": ["xsum"], - "metric": ["rouge"], - "data_collator_class": DataCollatorForSeq2Seq, - }, } +# List supported ORT optimizers to test +optim_test_params = [] +if is_torch_available(): + default_adam_kwargs = { + "betas": (ORTTrainingArguments.adam_beta1, ORTTrainingArguments.adam_beta2), + "eps": ORTTrainingArguments.adam_epsilon, + "lr": ORTTrainingArguments.learning_rate, + } -def _get_models_to_test(model_list, task_list, both_inf_backend=False, excluded: Optional[List[str]] = None): + optim_test_params = [ + ( + ORTOptimizerNames.ADAMW_ORT_FUSED, + onnxruntime.training.optim.FusedAdam, + default_adam_kwargs, + ), + ] + +# default torch.distributed port +DEFAULT_MASTER_PORT = "10999" + + +def _get_models_to_test(model_list, task_list, excluded: Optional[List[str]] = None): models_to_test = [] for name, model_name in model_list: - for feature, data_metric_config in task_list.items(): - if excluded and (name in excluded or feature in excluded): + for task, data_metric_config in task_list.items(): + if excluded and (name in excluded or task in excluded): continue - if both_inf_backend: - models_to_test.append( - (f"{name}_{feature}", model_name, feature, data_metric_config, True) - ) # inference_with_ort=True - models_to_test.append( - (f"{name}_{feature}", model_name, feature, data_metric_config, False) - ) # inference_with_ort=False - else: - models_to_test.append((f"{name}_{feature}", model_name, feature, data_metric_config)) + models_to_test.append((f"{name}_{task}", model_name, task, data_metric_config)) return sorted(models_to_test) @@ -151,17 +156,39 @@ def _get_data_collator(data_metric_config, tokenizer=None, model=None, training_ return data_collator -def get_ort_training_args(feature, **kwargs): - if feature in _ENCODER_TASKS_DATASETS_CONFIGS or feature in _DECODER_TASKS_DATASETS_CONFIGS: +def get_ort_training_args(task, **kwargs): + if task in 
_ENCODER_TASKS_DATASETS_CONFIGS or task in _DECODER_TASKS_DATASETS_CONFIGS: training_args = ORTTrainingArguments(**kwargs) - elif feature in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: + elif task in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: training_args = ORTSeq2SeqTrainingArguments(**kwargs) return training_args +def get_master_port(real_launcher=False): + """ + When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed) + the issue is that once the port is tied it can't be used anywhere else outside of this process, + since torch.dist doesn't free the port until the process exits. Therefore for the sake of being + able to run both emulated launcher and normal launcher tests we need 2 distinct ports. + + This function will give the right port in the right context. For real launcher it'll give the + base port, for emulated launcher it'll give the base port + 1. In both cases a string is + returned. + + Args: + `real_launcher`: whether a real launcher is going to be used, or the emulated one + + """ + + master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) + if not real_launcher: + master_port_base = str(int(master_port_base) + 1) + return master_port_base + + def get_ort_trainer( model_name, - feature, + task, data_metric_config, training_args, max_seq_length=None, @@ -170,7 +197,7 @@ def get_ort_trainer( max_test_samples=None, **kwargs, ): - training_kwargs = load_and_prepare(feature)( + training_kwargs = load_and_prepare(task)( model_name, data_metric_config, max_seq_length, @@ -185,26 +212,25 @@ def get_ort_trainer( if getattr(training_args, "predict_with_generate", False) is not True: training_kwargs.pop("compute_metrics", None) - if feature in _ENCODER_TASKS_DATASETS_CONFIGS or feature in _DECODER_TASKS_DATASETS_CONFIGS: - trainer = ORTTrainer(feature=feature, args=training_args, **training_kwargs) - elif feature in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: - trainer = ORTSeq2SeqTrainer(feature=feature, args=training_args, **training_kwargs) + if task in _ENCODER_TASKS_DATASETS_CONFIGS or task in _DECODER_TASKS_DATASETS_CONFIGS: + trainer = ORTTrainer(args=training_args, **training_kwargs) + elif task in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: + trainer = ORTSeq2SeqTrainer(args=training_args, **training_kwargs) else: raise return trainer, test_dataset -def load_and_prepare(feature): +def load_and_prepare(task): preprocess_mapping = { "text-classification": load_and_prepare_glue, "token-classification": load_and_prepare_ner, "text-generation": load_and_prepare_clm, "text-generation-with-past": load_and_prepare_clm, "text2text-generation": load_and_prepare_xsum, - "text2text-generation-with-past": load_and_prepare_xsum, } - return preprocess_mapping[feature] + return preprocess_mapping[task] def load_and_prepare_glue(model_name, data_metric_config, max_seq_length, padding="max_length", **kwargs): @@ -520,212 +546,140 @@ class ORTTrainerIntegrationTest(unittest.TestCase): def setUp(self): super().setUp() args = ORTTrainingArguments("..") + master_port = get_master_port(real_launcher=False) + self.dist_env_1_gpu = { + "MASTER_ADDR": "localhost", + "MASTER_PORT": master_port, + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + } self.n_epochs = min(args.num_train_epochs, 1) - self.per_device_train_batch_size = args.per_device_train_batch_size - self.per_device_eval_batch_size = args.per_device_eval_batch_size + self.per_device_train_batch_size = min(args.per_device_train_batch_size, 2) + self.per_device_eval_batch_size = min(args.per_device_eval_batch_size, 2) 
self.max_seq_length = 64 - self.max_train_samples = 50 - self.max_valid_samples = 20 - self.max_test_samples = 10 + self.max_train_samples = 10 + self.max_valid_samples = 5 + self.max_test_samples = 5 self.warmup_steps = 10 self.weight_decay = 0.01 @parameterized.expand( - _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True) # Skip test for OOM bug - + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, both_inf_backend=True), - skip_on_empty=True, - ) - def test_trainer_fp32(self, test_name, model_name, feature, data_metric_config, inference_with_ort): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - ) - - trainer, test_dataset = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - trainer.save_model() - trainer.evaluate(inference_with_ort=inference_with_ort) - trainer.predict(test_dataset, inference_with_ort=inference_with_ort) - gc.collect() - - @parameterized.expand( - _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True) # Skip test for OOM bug - + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, both_inf_backend=True), + _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) + + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), skip_on_empty=True, ) - def test_trainer_fp32_with_label_smoothing( - self, test_name, model_name, feature, data_metric_config, inference_with_ort - ): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - label_smoothing_factor=0.1, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - ) - - trainer, test_dataset = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - trainer.save_model() - trainer.evaluate(inference_with_ort=inference_with_ort) - trainer.predict(test_dataset, inference_with_ort=inference_with_ort) - gc.collect() + def test_trainer_fp32(self, test_name, model_name, task, data_metric_config): + with mockenv_context(**self.dist_env_1_gpu): + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = get_ort_training_args( + task=task, + output_dir=tmp_dir, + num_train_epochs=self.n_epochs, + 
per_device_train_batch_size=self.per_device_train_batch_size, + per_device_eval_batch_size=self.per_device_eval_batch_size, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + logging_dir=tmp_dir, + ) + + trainer, test_dataset = get_ort_trainer( + model_name, + task, + data_metric_config, + training_args, + max_seq_length=self.max_seq_length, + max_train_samples=self.max_train_samples, + max_valid_samples=self.max_valid_samples, + max_test_samples=self.max_test_samples, + ) + + trainer.train() + trainer.save_model() + trainer.evaluate() + trainer.predict(test_dataset) + gc.collect() @slow @parameterized.expand( _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) # Skip test for OOM bug + + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), skip_on_empty=True, ) - def test_trainer_fp16_pt_inference(self, test_name, model_name, feature, data_metric_config): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - fp16=True, - ) - - trainer, test_dataset = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - trainer.save_model() - trainer.evaluate() - trainer.predict(test_dataset) - gc.collect() + def test_trainer_fp32_with_label_smoothing(self, test_name, model_name, task, data_metric_config): + with mockenv_context(**self.dist_env_1_gpu): + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = get_ort_training_args( + task=task, + output_dir=tmp_dir, + num_train_epochs=self.n_epochs, + per_device_train_batch_size=self.per_device_train_batch_size, + per_device_eval_batch_size=self.per_device_eval_batch_size, + label_smoothing_factor=0.1, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + logging_dir=tmp_dir, + ) + + trainer, test_dataset = get_ort_trainer( + model_name, + task, + data_metric_config, + training_args, + max_seq_length=self.max_seq_length, + max_train_samples=self.max_train_samples, + max_valid_samples=self.max_valid_samples, + max_test_samples=self.max_test_samples, + ) + + trainer.train() + trainer.save_model() + trainer.evaluate() + trainer.predict(test_dataset) + gc.collect() @slow @parameterized.expand( _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) - # Exclude "with-past" tests as they fail for ORT inference after the mixed-precision training - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, excluded=["text-generation-with-past"]) # Skip test for OOM bug - + _get_models_to_test( - _SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, excluded=["text2text-generation-with-past"] - ), + + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), skip_on_empty=True, ) - def test_trainer_fp16_ort_inference(self, test_name, model_name, feature, 
data_metric_config): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - fp16=True, - ) - - trainer, test_dataset = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - trainer.save_model() - trainer.evaluate(inference_with_ort=True) - trainer.predict(test_dataset, inference_with_ort=True) - gc.collect() - - # Skip this test as a large amount of ops don't support bf16 yet. - # @unittest.skip("Skip BF16 test.") - # @slow - # @require_torch_bf16_gpu - # @parameterized.expand( - # _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) - # + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), - # skip_on_empty=True, - # ) - # def test_trainer_bf16(self, test_name, model_name, feature, data_metric_config): - # with tempfile.TemporaryDirectory() as tmp_dir: - # training_args = get_ort_training_args( - # feature=feature, - # output_dir=tmp_dir, - # num_train_epochs=self.n_epochs, - # per_device_train_batch_size=self.per_device_train_batch_size, - # per_device_eval_batch_size=self.per_device_eval_batch_size, - # warmup_steps=self.warmup_steps, - # weight_decay=self.weight_decay, - # logging_dir=tmp_dir, - # bf16=True, - # ) - - # trainer, test_dataset = get_ort_trainer( - # model_name, - # feature, - # data_metric_config, - # training_args, - # max_seq_length=self.max_seq_length, - # max_train_samples=self.max_train_samples, - # max_valid_samples=self.max_valid_samples, - # max_test_samples=self.max_test_samples, - # ) - - # trainer.train() - # trainer.save_model() - # trainer.evaluate() - # trainer.predict(test_dataset) - # gc.collect() + def test_trainer_fp16(self, test_name, model_name, task, data_metric_config): + with mockenv_context(**self.dist_env_1_gpu): + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = get_ort_training_args( + task=task, + output_dir=tmp_dir, + num_train_epochs=self.n_epochs, + per_device_train_batch_size=self.per_device_train_batch_size, + per_device_eval_batch_size=self.per_device_eval_batch_size, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + logging_dir=tmp_dir, + fp16=True, + ) + + trainer, test_dataset = get_ort_trainer( + model_name, + task, + data_metric_config, + training_args, + max_seq_length=self.max_seq_length, + max_train_samples=self.max_train_samples, + max_valid_samples=self.max_valid_samples, + max_test_samples=self.max_test_samples, + ) + + trainer.train() + trainer.save_model() + trainer.evaluate() + trainer.predict(test_dataset) + gc.collect() @slow @@ -734,14 +688,22 @@ class ORTTrainerIntegrationDeepSpeedTest(unittest.TestCase): def setUp(self): super().setUp() args = ORTTrainingArguments("..") + master_port = get_master_port(real_launcher=False) + self.dist_env_1_gpu = { + "MASTER_ADDR": "localhost", + "MASTER_PORT": master_port, + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + } self.n_epochs = min(args.num_train_epochs, 1) - 
self.per_device_train_batch_size = args.per_device_train_batch_size - self.per_device_eval_batch_size = args.per_device_eval_batch_size + self.per_device_train_batch_size = min(args.per_device_train_batch_size, 2) + self.per_device_eval_batch_size = min(args.per_device_eval_batch_size, 2) self.max_seq_length = 64 - self.max_train_samples = 30 - self.max_valid_samples = 10 - self.max_test_samples = 10 + self.max_train_samples = 10 + self.max_valid_samples = 5 + self.max_test_samples = 5 self.warmup_steps = 10 self.weight_decay = 0.01 @@ -749,126 +711,80 @@ def setUp(self): @parameterized.expand( random.sample( _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), - 1, + 1, # only test one ), skip_on_empty=True, ) - def test_trainer_fp16_ds_stage1(self, test_name, model_name, feature, data_metric_config): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - fp16=True, - deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_1.json", - ) - - trainer, _ = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - gc.collect() + def test_trainer_fp16_ds_stage1(self, test_name, model_name, task, data_metric_config): + with mockenv_context(**self.dist_env_1_gpu): + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = get_ort_training_args( + task=task, + output_dir=tmp_dir, + num_train_epochs=self.n_epochs, + per_device_train_batch_size=self.per_device_train_batch_size, + per_device_eval_batch_size=self.per_device_eval_batch_size, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + logging_dir=tmp_dir, + fp16=True, + deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_1.json", + ) + + trainer, _ = get_ort_trainer( + model_name, + task, + data_metric_config, + training_args, + max_seq_length=self.max_seq_length, + max_train_samples=self.max_train_samples, + max_valid_samples=self.max_valid_samples, + max_test_samples=self.max_test_samples, + ) + + trainer.train() + gc.collect() @parameterized.expand( random.sample( _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS) - # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS) + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), 1, ), skip_on_empty=True, ) - def test_trainer_fp16_ds_stage2(self, test_name, model_name, feature, data_metric_config): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - 
weight_decay=self.weight_decay, - logging_dir=tmp_dir, - fp16=True, - deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_2.json", - ) - - trainer, _ = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - gc.collect() - - -@slow -@pytest.mark.skip(reason="skip for now, server socket error") -class ORTTrainerIntegrationDDPTest(unittest.TestCase): - def test_trainer_ddp_glue(self): - subprocess.run( - "cp ../examples/onnxruntime/training/text-classification/run_glue.py ./", - shell=True, - ) - - subprocess.run( - f"{sys.executable} -m torch.distributed.launch" - " --nproc_per_node=1" - " run_glue.py" - " --model_name_or_path distilbert-base-uncased" - " --task_name mnli" - " --max_seq_length 128" - " --learning_rate 3e-6" - " --do_train" - " --output_dir /tmp/distilbert" - " --overwrite_output_dir" - " --max_steps 200" - " --logging_steps 20" - " --per_device_train_batch_size 32" - " --fp16 --optim adamw_ort_fused" - " --max_train_samples 500", - shell=True, - check=True, - ) - - -# List supported ORT optimizers to test -optim_test_params = [] -if is_torch_available(): - default_adam_kwargs = { - "betas": (ORTTrainingArguments.adam_beta1, ORTTrainingArguments.adam_beta2), - "eps": ORTTrainingArguments.adam_epsilon, - "lr": ORTTrainingArguments.learning_rate, - } - - optim_test_params = [ - ( - ORTOptimizerNames.ADAMW_ORT_FUSED, - onnxruntime.training.optim.FusedAdam, - default_adam_kwargs, - ), - ] + def test_trainer_fp16_ds_stage2(self, test_name, model_name, task, data_metric_config): + with mockenv_context(**self.dist_env_1_gpu): + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = get_ort_training_args( + task=task, + output_dir=tmp_dir, + num_train_epochs=self.n_epochs, + per_device_train_batch_size=self.per_device_train_batch_size, + per_device_eval_batch_size=self.per_device_eval_batch_size, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + logging_dir=tmp_dir, + fp16=True, + deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_2.json", + ) + + trainer, _ = get_ort_trainer( + model_name, + task, + data_metric_config, + training_args, + max_seq_length=self.max_seq_length, + max_train_samples=self.max_train_samples, + max_valid_samples=self.max_valid_samples, + max_test_samples=self.max_test_samples, + ) + + trainer.train() + gc.collect() @slow @@ -876,21 +792,6 @@ def test_trainer_ddp_glue(self): class ORTTrainerOptimizerChoiceTest(unittest.TestCase): def setUp(self): super().setUp() - args = ORTTrainingArguments("..") - self.n_epochs = min(args.num_train_epochs, 1) - self.per_device_train_batch_size = args.per_device_train_batch_size - self.per_device_eval_batch_size = args.per_device_eval_batch_size - - self.max_seq_length = 64 - self.max_train_samples = 50 - self.max_valid_samples = 20 - self.max_test_samples = 10 - - self.warmup_steps = 10 - self.weight_decay = 0.01 - - self.model_name = "bert-base-cased" - self.feature = "text-classification" def check_optim_and_kwargs(self, optim: OptimizerNames, mandatory_kwargs, expected_cls): args = ORTTrainingArguments(optim=optim, output_dir="None") @@ -903,37 +804,6 @@ def check_optim_and_kwargs(self, optim: OptimizerNames, mandatory_kwargs, expect actual_v = optim_kwargs[p] self.assertTrue(actual_v == v, f"Failed check for {p}. 
Expected {v}, but got {actual_v}.") - @parameterized.expand(optim_test_params, skip_on_empty=True) - def test_optim_supported(self, name: str, expected_cls, mandatory_kwargs): - # exercises all the valid --optim options - self.check_optim_and_kwargs(name, mandatory_kwargs, expected_cls) - - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = ORTTrainingArguments( - optim=name, - output_dir=tmp_dir, - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - ) - - trainer, _ = get_ort_trainer( - self.model_name, - self.feature, - _ENCODER_TASKS_DATASETS_CONFIGS[self.feature], - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - gc.collect() - def test_ort_fused_adam(self): # Pretend that onnxruntime-training is installed and mock onnxruntime.training.optim.FusedAdam exists. # Trainer.get_optimizer_cls_and_kwargs does not use FusedAdam. It only has to return the @@ -951,56 +821,3 @@ def test_ort_fused_adam(self): default_adam_kwargs, mock.optimizers.FusedAdam, ) - - -class ORTSeq2SeqTrainerSpecificIntegrationTest(unittest.TestCase): - def setUp(self): - super().setUp() - args = ORTTrainingArguments("..") - self.n_epochs = min(args.num_train_epochs, 1) - self.per_device_train_batch_size = args.per_device_train_batch_size - self.per_device_eval_batch_size = args.per_device_eval_batch_size - - self.max_seq_length = 32 - self.max_train_samples = 10 - self.max_valid_samples = 10 - self.max_test_samples = 10 - - self.warmup_steps = 10 - self.weight_decay = 0.01 - - @parameterized.expand( - _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS), - skip_on_empty=True, - ) - def test_predict_with_generate_ort(self, test_name, model_name, feature, data_metric_config): - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = get_ort_training_args( - feature=feature, - output_dir=tmp_dir, - evaluation_strategy="epoch", - num_train_epochs=self.n_epochs, - per_device_train_batch_size=self.per_device_train_batch_size, - per_device_eval_batch_size=self.per_device_eval_batch_size, - warmup_steps=self.warmup_steps, - weight_decay=self.weight_decay, - logging_dir=tmp_dir, - label_smoothing_factor=0.1, - predict_with_generate=True, - ) - - trainer, test_dataset = get_ort_trainer( - model_name, - feature, - data_metric_config, - training_args, - max_seq_length=self.max_seq_length, - max_train_samples=self.max_train_samples, - max_valid_samples=self.max_valid_samples, - max_test_samples=self.max_test_samples, - ) - - trainer.train() - trainer.evaluate(inference_with_ort=True) - trainer.predict(test_dataset, inference_with_ort=True) - gc.collect()
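
The training-argument changes in optimum/onnxruntime/training_args.py above bundle two behaviors that are easy to miss when reading the diff: the --fp16/--bf16 flags now override the Accelerate mixed-precision setting through the ACCELERATE_MIXED_PRECISION environment variable, and keys loaded from an --fsdp_config JSON file are normalized by dropping the legacy "fsdp_" prefix. Below is a minimal, standalone sketch of both steps, assuming nothing beyond the Python standard library; the helper names are illustrative only and are not part of the patch.

# Minimal sketch of two behaviors added to ORTTrainingArguments.__post_init__ above.
# Helper names are illustrative; they do not exist in the patched module.
import os


def sync_mixed_precision_env(fp16: bool, bf16: bool, half_precision_backend: str = "auto") -> str:
    """Propagate --fp16/--bf16 to Accelerate: the training arguments win over the accelerate config."""
    mixed_precision_dtype = os.environ.get("ACCELERATE_MIXED_PRECISION", "no")
    if half_precision_backend != "apex":
        if fp16:
            mixed_precision_dtype = "fp16"
        elif bf16:
            mixed_precision_dtype = "bf16"
        os.environ["ACCELERATE_MIXED_PRECISION"] = mixed_precision_dtype
    return mixed_precision_dtype


def normalize_fsdp_config(fsdp_config: dict) -> dict:
    """Strip the legacy "fsdp_" prefix from keys, e.g. "fsdp_min_num_params" becomes "min_num_params"."""
    normalized = dict(fsdp_config)
    for key in list(normalized.keys()):
        if key.startswith("fsdp_"):
            normalized[key[len("fsdp_"):]] = normalized.pop(key)
    return normalized


if __name__ == "__main__":
    print(sync_mixed_precision_env(fp16=True, bf16=False))          # -> "fp16"
    print(normalize_fsdp_config({"fsdp_min_num_params": 100000}))   # -> {"min_num_params": 100000}

In the patched code both steps run inside ORTTrainingArguments.__post_init__, so they take effect as soon as the arguments object is instantiated, before the ORTTrainer (or any Accelerate/DeepSpeed plugin) reads the environment.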