From 8044e5f60332940217576a067d99b454a83c5457 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 8 Nov 2023 00:42:55 -0800 Subject: [PATCH 01/13] SDXL: Update demo with dynamic shape serving with CUDA EP (#18340) Update the SDXL demo with dynamic shape serving with CUDA EP. --- .../stable_diffusion/demo_txt2img_xl.py | 103 ++++++++++++++---- .../models/stable_diffusion/demo_utils.py | 2 +- .../stable_diffusion/diffusion_models.py | 23 +++- .../pipeline_stable_diffusion.py | 9 +- 4 files changed, 107 insertions(+), 30 deletions(-) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py index 16e776a08282c..0b529875a2fe7 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py @@ -29,17 +29,7 @@ from pipeline_txt2img_xl import Txt2ImgXLPipeline -def run_demo(): - """Run Stable Diffusion XL Base + Refiner together (known as ensemble of expert denoisers) to generate an image.""" - - args = parse_arguments(is_xl=True, description="Options for Stable Diffusion XL Demo") - - prompt, negative_prompt = repeat_prompt(args) - - # Recommend image size as one of those used in training (see Appendix I in https://arxiv.org/pdf/2307.01952.pdf). - image_height = args.height - image_width = args.width - +def load_pipelines(args, batch_size): # Register TensorRT plugins engine_type = get_engine_type(args.engine) if engine_type == EngineType.TRT: @@ -49,19 +39,18 @@ def run_demo(): max_batch_size = 16 if (engine_type in [EngineType.ORT_TRT, EngineType.TRT]) and ( - args.build_dynamic_shape or image_height > 512 or image_width > 512 + args.build_dynamic_shape or args.height > 512 or args.width > 512 ): max_batch_size = 4 - batch_size = len(prompt) if batch_size > max_batch_size: raise ValueError(f"Batch size {batch_size} is larger than allowed {max_batch_size}.") # No VAE decoder in base when it outputs latent instead of image. 
- base_info = PipelineInfo(args.version, use_vae=False) + base_info = PipelineInfo(args.version, use_vae=False, min_image_size=640, max_image_size=1536) base = init_pipeline(Txt2ImgXLPipeline, base_info, engine_type, args, max_batch_size, batch_size) - refiner_info = PipelineInfo(args.version, is_refiner=True) + refiner_info = PipelineInfo(args.version, is_refiner=True, min_image_size=640, max_image_size=1536) refiner = init_pipeline(Img2ImgXLPipeline, refiner_info, engine_type, args, max_batch_size, batch_size) if engine_type == EngineType.TRT: @@ -77,7 +66,13 @@ def run_demo(): enable_vae_slicing = True if enable_vae_slicing: refiner.backend.enable_vae_slicing() + return base, refiner + +def run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False): + image_height = args.height + image_width = args.width + batch_size = len(prompt) base.load_resources(image_height, image_width, batch_size) refiner.load_resources(image_height, image_width, batch_size) @@ -112,10 +107,14 @@ def run_base_and_refiner(warmup=False): # inference once to get cuda graph _, _ = run_base_and_refiner(warmup=True) - print("[I] Warming up ..") + if args.num_warmup_runs > 0: + print("[I] Warming up ..") for _ in range(args.num_warmup_runs): _, _ = run_base_and_refiner(warmup=True) + if is_warm_up: + return + print("[I] Running StableDiffusion XL pipeline") if args.nvtx_profile: cudart.cudaProfilerStart() @@ -123,14 +122,80 @@ def run_base_and_refiner(warmup=False): if args.nvtx_profile: cudart.cudaProfilerStop() - base.teardown() - print("|------------|--------------|") print("| {:^10} | {:>9.2f} ms |".format("e2e", latency)) print("|------------|--------------|") + + +def run_demo(args): + """Run Stable Diffusion XL Base + Refiner together (known as ensemble of expert denoisers) to generate an image.""" + + prompt, negative_prompt = repeat_prompt(args) + batch_size = len(prompt) + base, refiner = load_pipelines(args, batch_size) + run_pipelines(args, base, refiner, prompt, negative_prompt) + base.teardown() + refiner.teardown() + + +def run_dynamic_shape_demo(args): + """Run demo of generating images with different size with list of prompts with ORT CUDA provider.""" + args.engine = "ORT_CUDA" + args.scheduler = "UniPC" + args.denoising_steps = 8 + args.disable_cuda_graph = True + + batch_size = args.repeat_prompt + base, refiner = load_pipelines(args, batch_size) + + image_sizes = [ + (1024, 1024), + (1152, 896), + (896, 1152), + (1216, 832), + (832, 1216), + (1344, 768), + (768, 1344), + (1536, 640), + (640, 1536), + ] + + # Warm up the pipelines. This only need once before serving. + args.prompt = ["warm up"] + args.num_warmup_runs = 3 + prompt, negative_prompt = repeat_prompt(args) + for height, width in image_sizes: + args.height = height + args.width = width + print(f"\nWarm up pipelines for Batch_size={batch_size}, Height={height}, Width={width}") + run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=True) + + # Run pipeline on a list of prompts. 
+ prompts = [ + "starry night over Golden Gate Bridge by van gogh", + "little cute gremlin sitting on a bed, cinematic", + ] + args.num_warmup_runs = 0 + for example_prompt in prompts: + args.prompt = [example_prompt] + prompt, negative_prompt = repeat_prompt(args) + + for height, width in image_sizes: + args.height = height + args.width = width + print(f"\nBatch_size={batch_size}, Height={height}, Width={width}, Prompt={example_prompt}") + run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False) + + base.teardown() refiner.teardown() if __name__ == "__main__": coloredlogs.install(fmt="%(funcName)20s: %(message)s") - run_demo() + + args = parse_arguments(is_xl=True, description="Options for Stable Diffusion XL Demo") + no_prompt = isinstance(args.prompt, list) and len(args.prompt) == 1 and not args.prompt[0] + if no_prompt: + run_dynamic_shape_demo(args) + else: + run_demo(args) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py index e65efd2c53839..d0e4e3adefbc3 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py @@ -78,7 +78,7 @@ def parse_arguments(is_xl: bool, description: str): help="Root Directory to store torch or ONNX models, built engines and output images etc.", ) - parser.add_argument("prompt", nargs="+", help="Text prompt(s) to guide image generation.") + parser.add_argument("prompt", nargs="*", default=[""], help="Text prompt(s) to guide image generation.") parser.add_argument( "--negative-prompt", nargs="*", default=[""], help="Optional negative prompt(s) to guide the image generation." 
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py index 4a2e9eb3443da..d93ca8dba7fa0 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py @@ -82,12 +82,21 @@ def infer_shapes(self): class PipelineInfo: - def __init__(self, version: str, is_inpaint: bool = False, is_refiner: bool = False, use_vae=False): + def __init__( + self, + version: str, + is_inpaint: bool = False, + is_refiner: bool = False, + use_vae=False, + min_image_size=256, + max_image_size=1024, + ): self.version = version self._is_inpaint = is_inpaint self._is_refiner = is_refiner self._use_vae = use_vae - + self._min_image_size = min_image_size + self._max_image_size = max_image_size if is_refiner: assert self.is_xl() @@ -187,6 +196,12 @@ def unet_embedding_dim(self): else: raise ValueError(f"Invalid version {self.version}") + def min_image_size(self): + return self._min_image_size + + def max_image_size(self): + return self._max_image_size + class BaseModel: def __init__( @@ -209,8 +224,8 @@ def __init__( self.min_batch = 1 self.max_batch = max_batch_size - self.min_image_shape = 256 # min image resolution: 256x256 - self.max_image_shape = 1024 # max image resolution: 1024x1024 + self.min_image_shape = pipeline_info.min_image_size() + self.max_image_shape = pipeline_info.max_image_size() self.min_latent_shape = self.min_image_shape // 8 self.max_latent_shape = self.max_image_shape // 8 diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py index e28db2b77105a..e34fab1218b21 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py @@ -163,12 +163,9 @@ def is_backend_tensorrt(self): return self.engine_type == EngineType.TRT def set_denoising_steps(self, denoising_steps: int): - if self.denoising_steps != denoising_steps: - assert self.denoising_steps is None # TODO(tianleiwu): support changing steps in different runs - # Pre-compute latent input scales and linear multistep coefficients - self.scheduler.set_timesteps(denoising_steps) - self.scheduler.configure() - self.denoising_steps = denoising_steps + self.scheduler.set_timesteps(denoising_steps) + self.scheduler.configure() + self.denoising_steps = denoising_steps def load_resources(self, image_height, image_width, batch_size): # If engine is built with static input shape, call this only once after engine build. From 2151c79bf190cdd71794d224f8efea4f8b9cd207 Mon Sep 17 00:00:00 2001 From: pengwa Date: Wed, 8 Nov 2023 17:42:50 +0800 Subject: [PATCH 02/13] Tune ORTModule logging experience a bit (#18298) ### Tune logging experience a bit After last time we update the ORTModule log experience, we found few issues: 1. `INFO` level output too many things, including PyTorch exporter verbose logs (tracing graphs) on every ranks. On this level, we only want to - Output a little bit more information to Users than `WARNING` level, for example the memory recomputation recommendations or other not-fully-ready features. - Output a little bit more information for a quick diagnostic, collected on rank-0 only. 2. 
The ONNX Runtime logging filter applied during graph build and session initialization sometimes hides issues (for example, a segmentation fault), leaving no useful information at `WARNING`/`INFO` for users to report back to us. This is not good!
3. Some of our devs like using `pdb` to debug Python code, but adding `import pdb; pdb.set_trace()` in a model's code might hang when they use `INFO` or `WARNING`, because the export happens and all output gets redirected due to log filtering. The only workaround is to switch to VERBOSE, which outputs far too many logs.

The corresponding changes proposed here are:
1. For `INFO` logging,
   - We only log on rank-0.
   - We restrict the ORT backend logging level to WARNING in this case, because the ORT backend code outputs way too many logs that should be under verbose, and we cannot guarantee they get cleaned up immediately once they are added.
   - We output the PyTorch exporter verbose log (including the tracing graph), which is useful for a quick diagnostic when an issue happens.
2. Remove all logging filtering on the ORT backend, so the segmentation fault details will not be hidden if it happens again.
3. Introduce a `DEVINFO` logging level, which
   - logs on all ranks,
   - sets the ORT backend logging level to INFO, and
   - turns all PyTorch exporter logging filtering OFF (to unblock pdb debugging).
4. Currently, using Memory Optimizer requires DEVINFO (which will output ORT backend INFO logs), so the memory optimizer document is updated to reflect this. https://github.com/microsoft/onnxruntime/pull/17481 will move the requirement back to INFO for showing memory optimization info.

You can check https://github.com/microsoft/onnxruntime/blob/pengwa/devinfo_level/docs/ORTModule_Training_Guidelines.md#log-level-explanations for a better view of the different log levels.

This PR also extracts some changes from a bigger one, https://github.com/microsoft/onnxruntime/pull/17481, to reduce its complexity for review.

### Motivation and Context

---------

Co-authored-by: mindest <30493312+mindest@users.noreply.github.com>
---
 docs/Memory_Optimizer.md                      |  8 +-
 docs/ORTModule_Training_Guidelines.md         | 84 +++++++++++++++++++
 .../ortmodule/_graph_execution_manager.py     |  1 -
 .../training/ortmodule/_inference_manager.py  |  4 +-
 .../python/training/ortmodule/_logger.py      | 19 +++--
 .../training/ortmodule/_training_manager.py   |  4 +-
 .../python/training/ortmodule/options.py      |  5 --
 7 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/docs/Memory_Optimizer.md b/docs/Memory_Optimizer.md
index 3ef3a575f20a1..e9ceae00a684d 100644
--- a/docs/Memory_Optimizer.md
+++ b/docs/Memory_Optimizer.md
@@ -20,10 +20,10 @@ Not all models and recipes need this optimizer technique. Imagine if your traini
 
 ## Quick trial
 
 1. Make sure ONNX Runtime training wheel is installed and correctly configured.
-2. Integrate models using `ORTModule`, be noted log_level should be equal or lower than INFO.
-   > ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.INFO))
-3. Run the training as usual and redirect all outputs into log file; then stop it after training few steps.
-4. Check the logging file, search "Summary", you could possibly find something like this:
+2. Integrate models using `ORTModule`, be noted log_level should be equal to or lower than DEVINFO.
+   > ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.DEVINFO))
+3. Run the training as usual and redirect all outputs into the log file; then stop it after training a few steps.
+4.
Check the logging file, and search "Summary", you could find something like this:
 ```
 MemoryOptimizer Summary:
 User config:
diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md
index 5350988b20964..12733c3551704 100644
--- a/docs/ORTModule_Training_Guidelines.md
+++ b/docs/ORTModule_Training_Guidelines.md
@@ -49,6 +49,90 @@ More options for **developers**.
   ```
   Check [DebugOptions implementation](../orttraining/orttraining/python/training/ortmodule/options.py) for more details.
 
+#### Log Level Explanations
+
+<table>
+<tr>
+<th>Log Level</th>
+<th>Description</th>
+</tr>
+<tr>
+<td>
+
+`FATAL` | `ERROR` | `WARNING` (For Users)
+
+`WARNING` is the default and recommended level for
+<br>users.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `FATAL` | `ERROR` | `WARNING`.
+- ORTModule log level - `FATAL` | `ERROR` | `WARNING`.
+- Rank-0 log filtering is `ON` (e.g. logging on rank-0-only).
+- PyTorch exporter export logs filtering is `ON`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `ON`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`INFO` (For Users | ORT Developers)
+
+`INFO` is used for collecting experimental
+<br>feature stats, or a little bit more error messages.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `WARNING`.
+- ORTModule log level - `INFO`.
+- Rank-0 log filtering is `ON` (e.g. logging on rank-0-only).
+- PyTorch exporter export logs filtering is `ON`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`DEVINFO` (For ORT Developers)
+
+`DEVINFO` is the recommended level for
+<br>debugging purposes.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `INFO`.
+- ORTModule log level - `INFO`.
+- Rank-0 log filtering is `OFF` (e.g. logging on all ranks).
+- PyTorch exporter export logs filtering is `OFF`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`VERBOSE` (For ORT Developers)
+
+`VERBOSE` is the last resort for debugging
+<br>hard problems.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `VERBOSE`.
+- ORTModule log level - `VERBOSE`.
+- Rank-0 log filtering is `OFF` (e.g. logging on all ranks).
+- PyTorch exporter export logs filtering is `OFF`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+</table>
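For illustration, a minimal sketch of wrapping a model at the new `DEVINFO` level; it assumes the usual `onnxruntime.training.ortmodule` import path and an installed onnxruntime-training wheel:

```python
# Minimal sketch: wrap a PyTorch model with ORTModule at the DEVINFO log level.
# The import path below is assumed (the common public one for onnxruntime-training).
import torch
from onnxruntime.training.ortmodule import DebugOptions, LogLevel, ORTModule

pt_model = torch.nn.Linear(8, 8)

# DEVINFO logs on all ranks and keeps the ORT backend log level at INFO (see the table above).
ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.DEVINFO))
```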
+ + ### 2.1 Environment Variables `ORTModule` provides environment variables targeting different use cases. diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 04820218b7c49..5eb1d9f382380 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -473,7 +473,6 @@ def _get_graph_transformer_config(self) -> C.TrainingGraphTransformerConfigurati return graph_transformer_config @_logger.TrackTime(_logger.ORTModuleInitPhase.GRAPH_BUILDER_INIT) - @_logger.SuppressLogs(_logger.ORTModuleInitPhase.GRAPH_BUILDER_INIT) def _initialize_graph_builder(self): """Creates a new OrtModuleGraphBuilder, initializes it and saves it to self._graph_builder""" diff --git a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py index 8d8be81c549d1..6690af9b71bf1 100644 --- a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py @@ -16,7 +16,7 @@ from ._fallback import ORTModuleFallbackException, _FallbackManager, _FallbackPolicy from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo from ._io import unflatten_user_output -from ._logger import ORTModuleInitPhase, SuppressLogs, TrackTime +from ._logger import ORTModuleInitPhase, TrackTime from ._utils import save_tuning_results, set_tuning_results from .options import DebugOptions, _SkipCheck @@ -207,7 +207,6 @@ def forward(self, *inputs, **kwargs): return self._fallback_manager.fallback(self._debug_options.logging.log_level, *inputs, **kwargs) @TrackTime(ORTModuleInitPhase.BUILD_GRAPH) - @SuppressLogs(ORTModuleInitPhase.BUILD_GRAPH) def _build_graph(self, graph_transformer_config): """Build an inference graph using the module_graph_builder""" @@ -221,7 +220,6 @@ def _build_graph(self, graph_transformer_config): ) @TrackTime(ORTModuleInitPhase.CREATE_SESSION) - @SuppressLogs(ORTModuleInitPhase.CREATE_SESSION) def _create_execution_agent(self): """Creates an InferenceAgent that can run forward graph on an inference model""" diff --git a/orttraining/orttraining/python/training/ortmodule/_logger.py b/orttraining/orttraining/python/training/ortmodule/_logger.py index e075ced8eaac2..0728ebdf19af8 100644 --- a/orttraining/orttraining/python/training/ortmodule/_logger.py +++ b/orttraining/orttraining/python/training/ortmodule/_logger.py @@ -21,15 +21,18 @@ class LogLevel(IntEnum): VERBOSE = 0 - INFO = 1 - WARNING = 2 - ERROR = 3 - FATAL = 4 + DEVINFO = 1 # For ORT developers. + INFO = 2 # For ORT users. + WARNING = 3 + ERROR = 4 + FATAL = 5 ORTMODULE_LOG_LEVEL_MAP: Dict[LogLevel, List[int]] = { LogLevel.VERBOSE: [Severity.VERBOSE, logging.DEBUG], - LogLevel.INFO: [Severity.INFO, logging.INFO], + LogLevel.DEVINFO: [Severity.INFO, logging.INFO], + # ONNX Runtime has too many INFO logs, so we map it to WARNING for a better user experience. + LogLevel.INFO: [Severity.WARNING, logging.INFO], LogLevel.WARNING: [Severity.WARNING, logging.WARNING], LogLevel.ERROR: [Severity.ERROR, logging.ERROR], LogLevel.FATAL: [Severity.FATAL, logging.FATAL], @@ -48,13 +51,13 @@ def configure_ortmodule_logger(log_level: LogLevel) -> logging.Logger: """Configure the logger for ortmodule according to following rules. 1. 
If multiple processes are used, the rank will be appended to the logger name. - 2. If the log level is greater than info, the logger will be + 2. If the log level is equal to or greater than INFO, the logger will be disabled for non-zero ranks. """ rank_info = f".rank-{get_rank()}" if get_world_size() > 1 else "" logger = logging.getLogger(f"orttraining{rank_info}") - # Disable the logger for non-zero ranks when level > info - logger.disabled = log_level > LogLevel.INFO and get_rank() != 0 + # Disable the logger for non-zero ranks when level >= INFO + logger.disabled = log_level >= LogLevel.INFO and get_rank() != 0 logger.setLevel(ortmodule_loglevel_to_python_loglevel(log_level)) return logger diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index e0f11e5aa407e..bafb64235546b 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -18,7 +18,7 @@ from ._gradient_accumulation_manager import GradientAccumulationManager from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo from ._io import _FlattenedModule, _InputInfo, unflatten_user_output -from ._logger import ORTModuleInitPhase, SuppressLogs, TrackTime +from ._logger import ORTModuleInitPhase, TrackTime from ._runtime_inspector import Phase from ._utils import save_tuning_results, set_tuning_results from .graph_optimizer_registry import GraphOptimizerRegistry @@ -358,7 +358,6 @@ def forward(self, *inputs, **kwargs): return self._fallback_manager.fallback(self._debug_options.logging.log_level, *inputs, **kwargs) @TrackTime(ORTModuleInitPhase.BUILD_GRAPH) - @SuppressLogs(ORTModuleInitPhase.BUILD_GRAPH) def _build_graph(self, graph_transformer_config): """Build an optimized gradient graph using the module_graph_builder""" @@ -401,7 +400,6 @@ def _build_graph(self, graph_transformer_config): self._gradient_map.append(-1) @TrackTime(ORTModuleInitPhase.CREATE_SESSION) - @SuppressLogs(ORTModuleInitPhase.CREATE_SESSION) def _create_execution_agent(self): """Creates a TrainingAgent that can run the forward and backward graph on the training model""" diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py index ff0cde37195cb..cddd9cd440b28 100644 --- a/orttraining/orttraining/python/training/ortmodule/options.py +++ b/orttraining/orttraining/python/training/ortmodule/options.py @@ -167,11 +167,6 @@ def torch_exporter_filter(self): @property def onnxruntime_log_filter(self): """Accessor for the filter onnxruntime logs configuration.""" - if self.log_level >= LogLevel.INFO: - return [ - "CleanUnusedInitializersAndNodeArgs] Removing initializer", - "Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED", - ] return None From 34f77eaa243ed16bbcea8fa585c9f89539488b27 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Wed, 8 Nov 2023 08:40:02 -0800 Subject: [PATCH 03/13] bfloat16 support for quickgelugrad (#18336) ### Description Registers BFloat16 datatype as valid input type for CUDA QuickGeluGrad Kernel. ### Motivation and Context Enabling `meta-llama/Llama-2-70b` to be finetuned with ONNX Runtime training. 
--------- Co-authored-by: Prathik Rao --- .../training_ops/cuda/activation/activations_grad.cc | 6 +++++- .../training_ops/cuda/activation/activations_grad_impl.cu | 7 ++++--- .../orttraining/training_ops/cuda/cuda_training_kernels.cc | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc b/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc index 7fde69d758ca9..98e3b878c9e0e 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc @@ -43,11 +43,15 @@ namespace cuda { ACTIVATION_GRAD_OP_TYPED(name, ver, domain, float) \ ACTIVATION_GRAD_OP_TYPED(name, ver, domain, double) +#define ACTIVATION_GRAD_OP_HFDX(name, ver, domain) \ + ACTIVATION_GRAD_OP_HFD(name, ver, domain) \ + ACTIVATION_GRAD_OP_TYPED(name, ver, domain, BFloat16) + ACTIVATION_GRAD_OP_HFD(GeluGrad, 1, kMSDomain); ACTIVATION_GRAD_OP_HFD(FastGeluGrad, 1, kMSDomain); ACTIVATION_GRAD_OP_HFD(ReluGrad, 1, kMSDomain); ACTIVATION_GRAD_OP_HFD(SigmoidGrad, 1, kMSDomain); -ACTIVATION_GRAD_OP_HFD(QuickGeluGrad, 1, kMSDomain); +ACTIVATION_GRAD_OP_HFDX(QuickGeluGrad, 1, kMSDomain); ACTIVATION_GRAD_OP_HFD(TanhGrad, 1, kMSDomain); ACTIVATION_GRAD_OP_HFD(LeakyReluGrad, 1, kMSDomain); diff --git a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu index 164aba866722e..dd6a44b9e3b56 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu @@ -83,14 +83,15 @@ struct OP_LeakyReluGrad : public CtxLeakyReluGrad { #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL(name, T) \ template void Impl_##name(cudaStream_t stream, const T* lhs_data, const T* rhs_data, T* output_data, const Ctx##name* func_ctx, size_t count); -#define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(x) \ +#define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFDX(x) \ SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, half) \ SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, float) \ - SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, double) + SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, double) \ + SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, BFloat16) #define ACTIVATION_GRAD_OP_NAME(name) \ BINARY_ELEMENTWISE_IMPL(name); \ - SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(name) + SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFDX(name) ACTIVATION_GRAD_OPS() #undef ACTIVATION_GRAD_OP_NAME diff --git a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc index ae4f48b6b49a2..eeaa51c4dc1d8 100644 --- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc @@ -121,6 +121,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, QuickGeluGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, QuickGeluGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, QuickGeluGrad); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, QuickGeluGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, 
TanhGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, TanhGrad); @@ -378,6 +379,7 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, From c8def0cc5193783492c0ed153f0d71527a80857e Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Wed, 8 Nov 2023 09:36:28 -0800 Subject: [PATCH 04/13] Add LLaMA GQA ragged batching (#18337) This PR updates replacing MHA with GQA and updates the LLaMA scripts for the modified GQA op. It is related to the changes in [this PR](https://github.com/microsoft/onnxruntime/pull/18283). ### Motivation and Context This PR allows us to run LLaMA with the GQA op end-to-end using ragged batching (i.e. batched inputs of different lengths). --- .../tools/transformers/convert_generation.py | 119 +++++++++++++----- .../tools/transformers/models/llama/README.md | 97 ++++++++++++-- .../transformers/models/llama/benchmark.py | 22 ++-- .../models/llama/convert_to_onnx.py | 43 ++++--- .../transformers/models/llama/llama_inputs.py | 47 +++---- .../transformers/models/llama/llama_parity.py | 29 +++-- .../transformers/models/llama/llama_torch.py | 6 +- 7 files changed, 257 insertions(+), 106 deletions(-) diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index 7aca5e8526a23..b59af41c49df7 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -1272,39 +1272,96 @@ def find_past_seq_len_usage(subg: GraphProto): return tensor_names_to_rename, nodes_to_remove -def replace_mha_with_gqa(model: OnnxModel, past_seq_len_input: str, kv_num_heads: int = 0, world_size: int = 1): - past_seq_len = past_seq_len_input - if past_seq_len not in model.get_graphs_input_names(): - # Add model input for past sequence length - new_input = onnx.helper.make_tensor_value_info(past_seq_len, onnx.TensorProto.INT64, shape=[1]) - model.model.graph.input.append(new_input) +def replace_mha_with_gqa(model: OnnxModel, attn_mask: str, kv_num_heads: int = 0, world_size: int = 1): + # Insert attention_mask subgraph to calculate shared inputs for all GroupQueryAttention nodes + # + # attention_mask + # / \ + # ReduceSum Shape + # | | + # Sub Gather + # | | + # seqlens_k total_sequence_length + # | | + # Cast to int32 Cast to int32 + + model.add_initializer( + onnx.helper.make_tensor( + name="one", + data_type=TensorProto.INT64, + dims=[1], + vals=[1], + ) + ) + reduce_sum_node = onnx.helper.make_node( + "ReduceSum", + inputs=[attn_mask, "one"], + outputs=[attn_mask + "_row_sums"], + name=model.create_node_name("ReduceSum"), + ) + sub_node = onnx.helper.make_node( + "Sub", + inputs=[attn_mask + "_row_sums", "one"], + outputs=["seqlens_k_int64"], + name=model.create_node_name("Sub"), + ) + seqlen_k_cast_node = onnx.helper.make_node( + "Cast", + inputs=["seqlens_k_int64"], + outputs=["seqlens_k"], + name=model.create_node_name("Cast"), + to=TensorProto.INT32, + ) + shape_node = onnx.helper.make_node( + "Shape", + inputs=[attn_mask], + outputs=[attn_mask + "_shape"], + name=model.create_node_name("Shape"), + ) + gather_node = onnx.helper.make_node( + "Gather", + inputs=[attn_mask + "_shape", "one"], + outputs=["total_seq_len_int64"], + name=model.create_node_name("Gather"), + 
axis=0, + ) + total_seqlen_cast_node = onnx.helper.make_node( + "Cast", + inputs=["total_seq_len_int64"], + outputs=["total_seq_len"], + name=model.create_node_name("Cast"), + to=TensorProto.INT32, + ) + model.model.graph.node.extend( + [reduce_sum_node, sub_node, seqlen_k_cast_node, shape_node, gather_node, total_seqlen_cast_node] + ) # Replace MultiHeadAttention with GroupQueryAttention - for node in model.model.graph.node: - if node.op_type == "MultiHeadAttention": - num_heads_mha = 0 - for att in node.attribute: - if att.name == "num_heads": - num_heads_mha = att.i - gqa_node = onnx.helper.make_node( - "GroupQueryAttention", - inputs=[ - node.input[0], # query - node.input[1], # key - node.input[2], # value - node.input[6], # past_key - node.input[7], # past_value - past_seq_len, # past_sequence_length - ], - outputs=node.output, - name=node.name.replace("MultiHeadAttention", "GroupQueryAttention"), - domain="com.microsoft", - num_heads=num_heads_mha // world_size, - kv_num_heads=num_heads_mha // world_size if kv_num_heads == 0 else kv_num_heads // world_size, - is_past_bsnh=0, - ) - model.model.graph.node.remove(node) - model.model.graph.node.extend([gqa_node]) + mha_nodes = list(filter(lambda node: node.op_type == "MultiHeadAttention", model.model.graph.node)) + for node in mha_nodes: + num_heads_mha = 0 + for att in node.attribute: + if att.name == "num_heads": + num_heads_mha = att.i + gqa_node = onnx.helper.make_node( + "GroupQueryAttention", + inputs=[ + node.input[0], # query + node.input[1], # key + node.input[2], # value + node.input[6], # past_key + node.input[7], # past_value + "seqlens_k", # seqlens_k (for attention_mask) + "total_seq_len", # total_seq_len (for attention_mask) + ], + outputs=node.output, + name=node.name.replace("MultiHeadAttention", "GroupQueryAttention"), + domain="com.microsoft", + num_heads=num_heads_mha // world_size, + kv_num_heads=num_heads_mha // world_size if kv_num_heads == 0 else kv_num_heads // world_size, + ) + model.model.graph.node.remove(node) + model.model.graph.node.extend([gqa_node]) return model diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md index 1bb6940d1cd74..0c6f830ed26b0 100644 --- a/onnxruntime/python/tools/transformers/models/llama/README.md +++ b/onnxruntime/python/tools/transformers/models/llama/README.md @@ -117,7 +117,7 @@ $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output l $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp32-cpu --precision fp32 --execution_provider cpu ``` -Export for FP16 CUDA +Export for FP16 CUDA (with MultiHeadAttention) ``` # From source: $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda @@ -126,6 +126,63 @@ $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output l $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda ``` +Export for FP16 CUDA (with GroupQueryAttention) +``` +# From source: +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --use_gqa + +# From wheel: +$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider 
cuda --use_gqa +``` + +Note: GroupQueryAttention currently runs on Linux for FP16 CUDA and INT4 CUDA models, and it can provide faster inference than MultiHeadAttention, especially for large sequence lengths (e.g. 1024 or larger). For the best performance, you should pre-allocate the KV cache buffers to have size `(batch_size, num_heads, max_sequence_length, head_size)` so that the past KV and present KV caches share the same memory. You also need to bind them with ONNX Runtime's [IO binding](https://onnxruntime.ai/docs/api/python/api_summary.html#iobinding). + +Here is an example of how you can bind directly to `torch.tensor` objects: +``` +# Assumes all inputs and outputs to the model are pre-allocated with the correct shapes in GPU memory + +# Bind inputs +for k, v in inputs.items(): + io_binding.bind_input( + name=k, + device_type="cuda", + device_id=0, + element_type=np.float16, + shape=tuple(v.shape), + buffer_ptr=v.data_ptr() + ) + +# Bind outputs +for output in model.get_outputs(): + name = output.name + if "present" in name: + # Bind KV cache outputs to KV cache inputs + v = inputs[name.replace("present", "past_key_values")] + io_binding.bind_output( + name=name, + device_type="cuda", + device_id=0, + element_type=np.float16, + shape=tuple(v.shape), + buffer_ptr=v.data_ptr() + ) + else: + # Bind other outputs as actual outputs + v = outputs[name] + io_binding.bind_output( + name=name, + device_type="cuda", + device_id=0, + element_type=np.float16, + shape=tuple(v.shape), + buffer_ptr=v.data_ptr() + ) + +io_binding.synchronize_inputs() +sess.run_with_iobinding(io_binding) +io_binding.synchronize_outputs() +``` + Export for INT8 CPU (SmoothQuant) ``` # From source: @@ -149,12 +206,14 @@ $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama Export for INT4 CUDA ``` # From source: -$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-gpu --precision int4 --quantization_method blockwise --execution_provider cuda +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-gpu --precision int4 --quantization_method blockwise --execution_provider cuda --use_gqa # From wheel: -$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-gpu --precision int4 --quantization_method blockwise --execution_provider cuda +$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-gpu --precision int4 --quantization_method blockwise --execution_provider cuda --use_gqa ``` +Note: See the FP16 CUDA notes about GroupQueryAttention. The `--use_gqa` flag is optional. + Export for INT4 CPU ``` # From source: @@ -168,13 +227,13 @@ Export LLaMA-2 70B sharded model into 4 partitions ``` # From source: # 1. Install necessary packages from requirements-70b-model.txt +$ pip install -r requirements-70b-model.txt # 2. Build ONNX Runtime from source with NCCL enabled. 
Here is a sample command: -$ ./build.sh --config RelWithDebInfo --use_cuda --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/local/cuda-12.2 --build_wheel --cuda_version=12.2 --parallel --skip_tests --enable_nccl --nccl_home /usr/local/cuda-12.2 --use_mpi --mpi_home=/usr/lib/x86_64-linux-gnu/ +$ ./build.sh --config Release --use_cuda --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/local/cuda-12.2 --build_wheel --cuda_version=12.2 --parallel --skip_tests --enable_nccl --nccl_home /usr/local/cuda-12.2 --use_mpi --mpi_home=/usr/lib/x86_64-linux-gnu/ # 3. Shard and export the LLaMA-2 70B model. With FP16, you will need at least 140GB of GPU memory to load the model. Therefore, you will need at least 4 40GB A100 GPUs or 2 80GB A100 GPUs to shard the PyTorch model and export each shard to ONNX. Here is an example command: -$ CUDA_VISIBLE_DEVICES=0,1,2,3 bash convert_70b_model.sh 4 -m meta-llama/Llama-2-70b-hf --output llama2-70b-dis --precision fp16 --execution_provider cuda - +$ CUDA_VISIBLE_DEVICES=0,1,2,3 bash convert_70b_model.sh 4 -m meta-llama/Llama-2-70b-hf --output llama2-70b-distributed --precision fp16 --execution_provider cuda --use_gqa ``` ## Benchmark LLaMA-2 @@ -220,7 +279,20 @@ python3 -m models.llama.benchmark \ --auth ``` -4. ONNX Runtime, FP32, Microsoft custom export +4. Optimum + ONNX Runtime, FP16, export via Optimum or convert_to_onnx +``` +python3 -m models.llama.benchmark \ + --benchmark-type hf-ort \ + --hf-ort-dir-path ./Llama-2-7b-hf-onnx/ \ + --model-name meta-llama/Llama-2-7b-hf \ + --precision fp16 \ + --batch-sizes "1 2" \ + --sequence-lengths "8 16" \ + --device cuda \ + --auth +``` + +5. ONNX Runtime, FP32, Microsoft custom export ``` python3 -m models.llama.benchmark \ --benchmark-type ort-msft \ @@ -232,7 +304,7 @@ python3 -m models.llama.benchmark \ --device cpu ``` -5. ONNX Runtime, FP16, Microsoft custom export +6. ONNX Runtime, FP16, Microsoft custom export ``` python3 -m models.llama.benchmark \ --benchmark-type ort-msft \ @@ -244,7 +316,7 @@ python3 -m models.llama.benchmark \ --device cuda ``` -6. ONNX Runtime, FP32, convert_to_onnx, use 2nd GPU +7. ONNX Runtime, FP32, convert_to_onnx, use 2nd GPU ``` CUDA_VISIBLE_DEVICES=1 python3 -m models.llama.benchmark \ --benchmark-type ort-convert-to-onnx \ @@ -256,7 +328,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m models.llama.benchmark \ --device cpu ``` -7. ONNX Runtime, FP16, convert_to_onnx, use 5th GPU +8. 
ONNX Runtime, FP16, convert_to_onnx, use 5th GPU ``` CUDA_VISIBLE_DEVICES=4 python3 -m models.llama.benchmark \ --benchmark-type ort-convert-to-onnx \ @@ -283,5 +355,8 @@ python3 -m models.llama.benchmark_all \ --precision fp16 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ - --device cuda + --device cuda \ + --warmup-runs 5 \ + --num-runs 1000 \ + --timeout 60 # number of minutes before moving to the next benchmark ``` diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark.py b/onnxruntime/python/tools/transformers/models/llama/benchmark.py index be678931de5d1..021b0dd03a9db 100644 --- a/onnxruntime/python/tools/transformers/models/llama/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/llama/benchmark.py @@ -11,6 +11,7 @@ import onnx import psutil import torch +from benchmark_helper import measure_memory, setup_logger from dist_settings import get_rank, get_size from llama_inputs import ( add_io_bindings, @@ -22,10 +23,9 @@ from optimum.onnxruntime import ORTModelForCausalLM from torch.profiler import ProfilerActivity, profile, record_function from tqdm import trange -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer import onnxruntime as ort -from onnxruntime.transformers.benchmark_helper import measure_memory, setup_logger logger = logging.getLogger(__name__) @@ -107,6 +107,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): past_seq_len=0, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, engine="pt", return_dict=True, ) @@ -118,6 +119,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): past_seq_len=args.sequence_length, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, engine="pt", return_dict=True, ) @@ -132,6 +134,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): past_seq_len=0, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, engine="ort", return_dict=True, world_size=args.world_size, @@ -144,6 +147,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): past_seq_len=args.sequence_length, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, engine="ort", return_dict=True, world_size=args.world_size, @@ -160,6 +164,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): seq_len=args.sequence_length, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, split_kv=split_kv, ) iter_inputs = get_msft_sample_inputs( @@ -169,6 +174,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): seq_len=1, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, split_kv=split_kv, ) @@ -192,7 +198,7 @@ def get_model(args: argparse.Namespace): if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}: source = args.hf_pt_dir_path if args.hf_pt_dir_path else args.model_name start_time = time.time() - model = LlamaForCausalLM.from_pretrained( + model = AutoModelForCausalLM.from_pretrained( source, torch_dtype=torch.float16 if args.use_fp16 else torch.float32, use_auth_token=args.auth, @@ -456,7 +462,7 @@ def prepare_ort_inputs(inputs, kv_cache_ortvalues): # Add IO bindings for non-CPU execution providers if args.device != "cpu": io_binding, kv_cache_ortvalues = add_io_bindings( - model, inputs, args.device, int(args.rank), kv_cache_ortvalues + model, inputs, args.device, int(args.rank), args.use_gqa, kv_cache_ortvalues ) 
setattr(args, "io_binding", io_binding) # noqa: B010 return io_binding, kv_cache_ortvalues @@ -650,8 +656,8 @@ def main(): args.rank = rank args.world_size = world_size - tokenizer = LlamaTokenizer.from_pretrained(args.model_name) - config = LlamaConfig.from_pretrained(args.model_name) + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + config = AutoConfig.from_pretrained(args.model_name) target_device = f"cuda:{args.rank}" if args.device != "cpu" else args.device use_fp16 = args.precision == "fp16" @@ -670,9 +676,9 @@ def main(): gqa_nodes = list(filter(lambda node: node.op_type == "GroupQueryAttention", onnx_model.graph.node)) use_buffer_share = use_fp16 and len(gqa_nodes) > 0 and args.device != "cpu" - setattr(args, "past_present_share_buffer", use_buffer_share) # noqa: B010 + setattr(args, "use_gqa", use_buffer_share) # noqa: B010 else: - setattr(args, "past_present_share_buffer", False) # noqa: B010 + setattr(args, "use_gqa", False) # noqa: B010 # Measure prompt cost (init_inputs) and generated token cost (iter_inputs) for batch_size, sequence_length in itertools.product(args.batch_sizes, args.sequence_lengths): diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index b0e0b41e75d3d..c9c7f4d39d423 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -7,6 +7,8 @@ import onnx import torch +from benchmark_helper import Precision, prepare_environment, setup_logger +from convert_generation import replace_mha_with_gqa from dist_settings import barrier, get_rank, get_size, init_dist from llama_inputs import get_merged_sample_with_past_kv_inputs, get_sample_inputs, get_sample_with_past_kv_inputs from llama_parity import main as parity_check @@ -14,12 +16,10 @@ from onnx_model import OnnxModel from optimizer import optimize_model from packaging import version -from transformers import LlamaConfig, LlamaForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM from onnxruntime import quantization as ort_quantization from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer -from onnxruntime.transformers.benchmark_helper import Precision, prepare_environment, setup_logger -from onnxruntime.transformers.convert_generation import replace_mha_with_gqa logger = logging.getLogger("") init_dist() @@ -133,7 +133,7 @@ def save_onnx_model(onnx_model: onnx.ModelProto, output_path: str, data_path: st # temp_dir.cleanup() # def run_dynamo_export( - args: argparse.Namespace, l_config: LlamaConfig, llama: LlamaForCausalLM, rank: int = 0, world_size: int = 1 + args: argparse.Namespace, l_config: AutoConfig, llama: AutoModelForCausalLM, rank: int = 0, world_size: int = 1 ): from torch._dynamo import config @@ -194,7 +194,7 @@ def _prepare_dir(dir_path): def run_torchscript_separate_export( - args: argparse.Namespace, l_config: LlamaConfig, llama: LlamaForCausalLM, rank: int = 0, world_size: int = 1 + args: argparse.Namespace, l_config: AutoConfig, llama: AutoModelForCausalLM, rank: int = 0, world_size: int = 1 ): # Dummy values for export batch_size, sequence_length = 2, 8 @@ -313,7 +313,7 @@ def run_torchscript_separate_export( def run_torchscript_merged_export( - args: argparse.Namespace, l_config: LlamaConfig, llama: LlamaForCausalLM, rank: int = 0, world_size: int = 1 + args: argparse.Namespace, l_config: AutoConfig, llama: AutoModelForCausalLM, rank: 
int = 0, world_size: int = 1 ): # Dummy values for export batch_size, sequence_length, past_sequence_length = 2, 8, 0 @@ -391,7 +391,7 @@ def run_torchscript_merged_export( # Optimize the model as FP32 -def optimize_export(config: LlamaConfig, input_path: str, output_path: str): +def optimize_export(config: AutoConfig, input_path: str, output_path: str): from fusion_options import FusionOptions optimization_options = FusionOptions("gpt2") @@ -411,7 +411,7 @@ def optimize_export(config: LlamaConfig, input_path: str, output_path: str): def convert_to_float16( - args: argparse.Namespace, config: LlamaConfig, old_paths: List[str], rank: int = 0, world_size: int = 1 + args: argparse.Namespace, config: AutoConfig, old_paths: List[str], rank: int = 0, world_size: int = 1 ): decoder_model_fp16_path = os.path.join(args.output, f"rank_{rank}_{args.model_name}_decoder_model_fp16.onnx") decoder_with_past_model_fp16_path = os.path.join( @@ -427,7 +427,8 @@ def convert_to_float16( if os.path.exists(fp32_path): model = OnnxModel(onnx.load_model(fp32_path, load_external_data=True)) model.convert_float_to_float16(keep_io_types=False) - model = use_group_query_attention(config, model, world_size) + if args.use_gqa: + model = use_group_query_attention(config, model, world_size) model.save_model_to_file(fp16_path, use_external_data_format=True) del model logger.info(f"The ONNX model at {fp32_path} has been converted to float16 and saved at {fp16_path}!") @@ -437,11 +438,9 @@ def convert_to_float16( return new_paths -def use_group_query_attention(config: LlamaConfig, fp16_model_opt: OnnxModel, world_size: int = 1): - # Replace MultiHeadAttention with GroupQueryAttention and remove attention mask nodes - fp16_model_opt = replace_mha_with_gqa( - fp16_model_opt, "past_sequence_length", config.num_key_value_heads, world_size - ) +def use_group_query_attention(config: AutoConfig, fp16_model_opt: OnnxModel, world_size: int = 1): + # Replace MultiHeadAttention with GroupQueryAttention + fp16_model_opt = replace_mha_with_gqa(fp16_model_opt, "attention_mask", config.num_key_value_heads, world_size) fp16_model_opt.prune_graph() fp16_model_opt.update_graph(allow_remove_graph_inputs=True) return fp16_model_opt @@ -520,8 +519,8 @@ def smooth_quant( logger.info(f"The {args.model_name} ONNX model has been successfully quantized to int8!") - logger.info(f"Removing {args.nc_workspace}") - os.system(f"rm -R {args.nc_workspace}") + logger.warning(f"Removing {args.nc_workspace}") + shutil.rmtree(args.nc_workspace) def remove_existing_model(model_path: str): @@ -594,6 +593,14 @@ def get_args(): ) parser.set_defaults(reexport=False) + parser.add_argument( + "--use_gqa", + required=False, + action="store_true", + help="Use GroupQueryAttention instead of MultiHeadAttention", + ) + parser.set_defaults(use_gqa=False) + parser.add_argument( "--no_merged", required=False, @@ -747,7 +754,7 @@ def main(): location = args.original_model_name if use_auth_token else args.input - # use cuda for Llama-2-70b to speedup export, other models use CPU by default + # Use CUDA for LLaMA-2-70B to speed up export and CPU for other models l_config, llama = setup_torch_model( args, location, use_auth_token, device=args.device if args.model_name == "Llama-2-70b-hf" else None ) @@ -944,6 +951,8 @@ def main(): parity_cmd.append("--use_past_kv") if "merged" in filename: parity_cmd.append("--merged") + if args.use_gqa: + parity_cmd.append("--use_gqa") try: logger.debug(f"check parity with cmd: {parity_cmd}") diff --git 
a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py index 6530eead55f03..bae1ae82e8f7e 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py @@ -2,7 +2,7 @@ import numpy as np import torch -from transformers import LlamaConfig +from transformers import AutoConfig from onnxruntime import InferenceSession, OrtValue @@ -24,7 +24,7 @@ def get_position_ids(attention_mask: torch.Tensor, use_past_kv: bool): # attention_mask: (batch_size, sequence_length) # position_ids: (batch_size, sequence_length) def get_sample_inputs( - config: LlamaConfig, + config: AutoConfig, device: torch.device, batch_size: int, seq_len: int, @@ -59,7 +59,7 @@ def get_sample_inputs( # past_key: (batch_size, num_heads, past_sequence_length, head_size) # past_value: (batch_size, num_heads, past_sequence_length, head_size) def get_sample_with_past_kv_inputs( - config: LlamaConfig, + config: AutoConfig, device: torch.device, batch_size: int, past_seq_len: int, @@ -115,13 +115,14 @@ def get_sample_with_past_kv_inputs( # For models with GQA, kv_sequence_length = max_sequence_length # For models without GQA, kv_sequence_length = past_sequence_length def get_merged_sample_with_past_kv_inputs( - config: LlamaConfig, + config: AutoConfig, device: torch.device, batch_size: int, seq_len: int, past_seq_len: int, max_seq_len: int, use_fp16: bool = False, + use_gqa: bool = False, engine: str = "pt", return_dict: bool = False, world_size: int = 1, @@ -156,9 +157,7 @@ def get_merged_sample_with_past_kv_inputs( assert isinstance(past_kv, dict) inputs.update(past_kv) - if use_fp16: # If model has GQA - del inputs["attention_mask"] - inputs["past_sequence_length"] = np.array([past_seq_len], dtype=np.int64) + if use_gqa: inputs = enable_past_present_share_buffer(inputs, past_seq_len, max_seq_len) else: @@ -170,12 +169,13 @@ def get_merged_sample_with_past_kv_inputs( # Inputs for Microsoft export from https://github.com/microsoft/Llama-2-Onnx def get_msft_sample_inputs( - config: LlamaConfig, + config: AutoConfig, batch_size: int, past_seq_len: int, seq_len: int, max_seq_len: int, use_fp16: bool, + use_gqa: bool, split_kv: bool, ): np_dtype = np.float16 if use_fp16 else np.float32 @@ -213,8 +213,7 @@ def get_msft_sample_inputs( } ) - if use_fp16: # If model has GQA - del ort_inputs["attn_mask"] + if use_gqa: ort_inputs = enable_past_present_share_buffer(ort_inputs, past_seq_len, max_seq_len) return ort_inputs @@ -222,7 +221,7 @@ def get_msft_sample_inputs( # Create past_key_values # Each is of shape (batch_size, num_heads, past_sequence_length, head_size) -def get_past_kv_inputs(config: LlamaConfig, batch_size: int, past_seq_len: int, use_fp16: bool, world_size: int = 1): +def get_past_kv_inputs(config: AutoConfig, batch_size: int, past_seq_len: int, use_fp16: bool, world_size: int = 1): num_heads, head_size = config.num_key_value_heads // world_size, config.hidden_size // config.num_attention_heads torch_dtype = torch.float16 if use_fp16 else torch.float32 past_kv = [ @@ -247,8 +246,7 @@ def flatten_past_kv_inputs(past_key_values: List[Tuple[torch.Tensor, torch.Tenso # Format PyTorch inputs to ONNX Runtime inputs def convert_inputs_for_ort( pt_inputs: dict, - use_fp16: bool, - use_buffer_share: bool = False, + use_gqa: bool = False, past_seq_len: int = 0, max_seq_len: int = 2048, device: str = "", @@ -260,17 +258,11 @@ def convert_inputs_for_ort( ort_inputs[k] 
= v elif k == "past_key_values": ort_inputs.update(flatten_past_kv_inputs(v)) - elif k == "attention_mask" and use_fp16 and use_buffer_share: - # Skip because FP16 model has GroupQueryAttention, uses buffer sharing, - # and GQA supports a causal mask by default - - # Instead, add the past sequence length input for GQA - ort_inputs["past_sequence_length"] = np.array([past_seq_len], dtype=np.int64) else: ort_inputs[k] = v.detach().cpu().numpy() - # Reshape kv caches if using past-present-share-buffer - if use_buffer_share and device != "" and device != "cpu" and device_id > -1: + # Reshape KV caches if using past-present-share-buffer + if use_gqa and device != "" and device != "cpu" and device_id > -1: ort_inputs = enable_past_present_share_buffer(ort_inputs, past_seq_len, max_seq_len) return ort_inputs @@ -289,17 +281,14 @@ def enable_past_present_share_buffer(ort_inputs: dict, past_seq_len: int, max_se # Add IO bindings for execution providers -def add_io_bindings(model: InferenceSession, ort_inputs: dict, device: str, device_id: int, kv_cache_ortvalues: dict): - use_fp16 = False +def add_io_bindings( + model: InferenceSession, ort_inputs: dict, device: str, device_id: int, use_gqa: bool, kv_cache_ortvalues: dict +): io_binding = model.io_binding() for k, v in ort_inputs.items(): - # Detect if model is in FP16 - if v.dtype == np.float16: - use_fp16 = True - # Bind OrtValue inputs to device - if use_fp16 and ("cache" in k or "past_key_values" in k): + if use_gqa and ("cache" in k or "past_key_values" in k): if k not in kv_cache_ortvalues: v_device = OrtValue.ortvalue_from_numpy(v, device_type=device, device_id=device_id) io_binding.bind_ortvalue_input(k, v_device) @@ -313,7 +302,7 @@ def add_io_bindings(model: InferenceSession, ort_inputs: dict, device: str, devi for output in model.get_outputs(): name = output.name - if use_fp16 and ("out" in name or "present" in name): + if use_gqa and ("out" in name or "present" in name): # Bind present KV cache outputs to past KV cache inputs in order to buffer share input_name = name.replace("out", "cache").replace("present", "past_key_values") io_binding.bind_ortvalue_output(name, kv_cache_ortvalues[input_name]) diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py index 42581caf3bb9e..418a65325c8f0 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py @@ -6,6 +6,7 @@ import numpy as np import torch +from benchmark_helper import setup_logger from dist_settings import get_rank, get_size from llama_inputs import ( add_io_bindings, @@ -15,10 +16,9 @@ get_sample_with_past_kv_inputs, ) from llama_torch import setup_torch_model -from transformers import LlamaConfig, LlamaForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM import onnxruntime as ort -from onnxruntime.transformers.benchmark_helper import setup_logger logger = logging.getLogger("") @@ -30,7 +30,7 @@ def get_sequence_lengths(args: argparse.Namespace): return past_sequence_length, curr_sequence_length, max_sequence_length -def get_inputs(args: argparse.Namespace, config: LlamaConfig): +def get_inputs(args: argparse.Namespace, config: AutoConfig): # Dummy values for parity world_size = get_size() batch_size = 2 @@ -45,6 +45,7 @@ def get_inputs(args: argparse.Namespace, config: LlamaConfig): past_seq_len=past_sequence_length, max_seq_len=max_sequence_length, use_fp16=args.use_fp16, + 
use_gqa=args.use_gqa, return_dict=True, world_size=world_size, ) @@ -64,7 +65,9 @@ def get_inputs(args: argparse.Namespace, config: LlamaConfig): return inputs -def verify_parity(args: argparse.Namespace, config: LlamaConfig, pt_model: LlamaForCausalLM, kv_cache_ortvalues: dict): +def verify_parity( + args: argparse.Namespace, config: AutoConfig, pt_model: AutoModelForCausalLM, kv_cache_ortvalues: dict +): inputs = get_inputs(args, config) # Run inference with PyTorch @@ -82,8 +85,7 @@ def verify_parity(args: argparse.Namespace, config: LlamaConfig, pt_model: Llama past_sequence_length, _, max_sequence_length = get_sequence_lengths(args) inputs = convert_inputs_for_ort( inputs, - use_fp16=args.use_fp16, - use_buffer_share=args.use_fp16, + use_gqa=args.use_gqa, past_seq_len=past_sequence_length, max_seq_len=max_sequence_length, device=args.execution_provider, @@ -102,7 +104,12 @@ def verify_parity(args: argparse.Namespace, config: LlamaConfig, pt_model: Llama # Add IO bindings for non-CPU execution providers if args.execution_provider != "cpu": io_binding, kv_cache_ortvalues = add_io_bindings( - ort_model, inputs, args.execution_provider, int(args.rank), kv_cache_ortvalues + ort_model, + inputs, + args.execution_provider, + int(args.rank), + args.use_gqa, + kv_cache_ortvalues, ) io_binding.synchronize_inputs() @@ -183,6 +190,14 @@ def get_args(argv: List[str]): ) parser.set_defaults(use_past_kv=False) + parser.add_argument( + "-g", + "--use_gqa", + action="store_true", + help="Use if model has GroupQueryAttention", + ) + parser.set_defaults(use_gqa=False) + parser.add_argument( "--merged", action="store_true", diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_torch.py b/onnxruntime/python/tools/transformers/models/llama/llama_torch.py index cf6406dde5be0..94e0397116d1c 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_torch.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_torch.py @@ -3,7 +3,7 @@ import torch from dist_settings import barrier, get_rank, get_size -from transformers import LlamaConfig, LlamaForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM logger = logging.getLogger("") @@ -19,9 +19,9 @@ def setup_torch_model(args, location, use_auth_token, torch_dtype=torch.float32, for i in range(world_size): if i == rank % (world_size): - l_config = LlamaConfig.from_pretrained(location, use_auth_token=use_auth_token, cache_dir=args.cache_dir) + l_config = AutoConfig.from_pretrained(location, use_auth_token=use_auth_token, cache_dir=args.cache_dir) l_config.use_cache = True - llama = LlamaForCausalLM.from_pretrained( + llama = AutoModelForCausalLM.from_pretrained( location, use_auth_token=use_auth_token, config=l_config, From a0eeeafa8050bc354fae279138147f3ea59f83eb Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 8 Nov 2023 10:06:15 -0800 Subject: [PATCH 05/13] [QNN EP] Session option for graph optimization (#18262) ### Description Adds the QNN session option `htp_graph_finalization_optimization_mode` to enable QNN graph optimizations at the expense of longer preparation time. ### Motivation and Context Allow enabling QNN graph optimizations per app/model. 
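For illustration, a minimal sketch of setting this option from the Python API; the `backend_path` value and the model path are assumptions, not prescribed by this change:

```python
# Hypothetical sketch: enable a more aggressive HTP graph finalization mode for the QNN EP.
# Assumes a QNN-enabled ONNX Runtime build and a valid QNN HTP backend library on disk.
import onnxruntime as ort

session = ort.InferenceSession(
    "model.qdq.onnx",  # placeholder model path
    providers=["QNNExecutionProvider"],
    provider_options=[{
        "backend_path": "QnnHtp.dll",                     # assumed backend library name/path
        "htp_graph_finalization_optimization_mode": "3",  # "0" = default; "1"-"3" trade longer prep time for a more optimal graph
    }],
)
```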
--- .../core/session/onnxruntime_c_api.h | 9 ++- .../qnn/builder/onnx_ctx_model_helper.cc | 42 +++++++------- .../qnn/builder/qnn_backend_manager.cc | 35 ++++++++++-- .../qnn/builder/qnn_backend_manager.h | 3 + .../core/providers/qnn/builder/qnn_def.h | 8 +++ .../qnn/builder/qnn_graph_configs_helper.cc | 43 ++++++++++++++ .../qnn/builder/qnn_graph_configs_helper.h | 56 +++++++++++++++++++ .../core/providers/qnn/builder/qnn_model.cc | 5 +- .../core/providers/qnn/builder/qnn_model.h | 3 +- .../providers/qnn/qnn_execution_provider.cc | 44 ++++++++++++++- .../providers/qnn/qnn_execution_provider.h | 6 ++ onnxruntime/test/onnx/main.cc | 14 ++++- .../test/perftest/command_args_parser.cc | 3 + onnxruntime/test/perftest/ort_test_session.cc | 14 ++++- .../test/providers/qnn/qnn_basic_test.cc | 22 +++++++- 15 files changed, 270 insertions(+), 37 deletions(-) create mode 100644 onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc create mode 100644 onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 729a302f3dd0f..1d02b72342722 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -3598,12 +3598,17 @@ struct OrtApi { * "rpc_control_latency": QNN RPC control latency. * "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance", * "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default". - * "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the Onnx skeleton model. + * "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the ONNX skeleton model. * 0 means dump the QNN context binary into separate bin file and set the path to EPContext->ep_cache_context. - * The path is relative path to the Onnx skeleton model file. + * The path is relative path to the ONNX skeleton model file. * "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will * dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and * may alter model/EP partitioning. Use only for debugging. + * "htp_graph_finalization_optimization_mode": Set the optimization mode for graph finalization on the HTP backend. Available options: + * - "0": Default. + * - "1": Faster preparation time, less optimal graph. + * - "2": Longer preparation time, more optimal graph. + * - "3": Longest preparation time, most likely even more optimal graph. See QNN SDK documentation for specific details. 
* * SNPE supported keys: * "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16", diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index 7ccd765e65381..bd9986e661e21 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -86,32 +86,30 @@ Status QnnCacheModelHandler::GetEpContextFromGraph(const onnxruntime::GraphViewe return qnn_backend_manager->LoadCachedQnnContextFromBuffer(const_cast(context_binary.c_str()), static_cast(context_binary.length()), qnn_model); - } else { - std::string external_qnn_context_binary_file_name = node_helper.Get(EP_CACHE_CONTEXT, ""); + } - std::string context_binary_path(std::filesystem::path(ctx_onnx_model_path).parent_path().string() + - "/" + external_qnn_context_binary_file_name); - size_t buffer_size{0}; - std::ifstream cache_file(context_binary_path.c_str(), std::ifstream::binary); - ORT_RETURN_IF(!cache_file || !cache_file.good(), "Failed to open cache file."); + std::string external_qnn_context_binary_file_name = node_helper.Get(EP_CACHE_CONTEXT, ""); - cache_file.seekg(0, cache_file.end); - buffer_size = static_cast(cache_file.tellg()); - ORT_RETURN_IF(0 == buffer_size, "Empty cache file encountered."); + std::string context_binary_path(std::filesystem::path(ctx_onnx_model_path).parent_path().string() + + "/" + external_qnn_context_binary_file_name); + size_t buffer_size{0}; + std::ifstream cache_file(context_binary_path.c_str(), std::ifstream::binary); + ORT_RETURN_IF(!cache_file || !cache_file.good(), "Failed to open cache file."); - cache_file.seekg(0, cache_file.beg); - std::unique_ptr buffer = std::make_unique(buffer_size); - ORT_RETURN_IF(nullptr == buffer, "Failed to allocate memory for cache file."); - // Load file into buffer - const auto& read_result = cache_file.read(buffer.get(), buffer_size); - ORT_RETURN_IF(!read_result, "Failed to read contents from cached context file."); - cache_file.close(); - return qnn_backend_manager->LoadCachedQnnContextFromBuffer(buffer.get(), - static_cast(buffer_size), - qnn_model); - } + cache_file.seekg(0, cache_file.end); + buffer_size = static_cast(cache_file.tellg()); + ORT_RETURN_IF(0 == buffer_size, "Empty cache file encountered."); - return Status::OK(); + cache_file.seekg(0, cache_file.beg); + std::unique_ptr buffer = std::make_unique(buffer_size); + ORT_RETURN_IF(nullptr == buffer, "Failed to allocate memory for cache file."); + // Load file into buffer + const auto& read_result = cache_file.read(buffer.get(), buffer_size); + ORT_RETURN_IF(!read_result, "Failed to read contents from cached context file."); + cache_file.close(); + return qnn_backend_manager->LoadCachedQnnContextFromBuffer(buffer.get(), + static_cast(buffer_size), + qnn_model); } Status QnnCacheModelHandler::GetMetadataFromEpContextModel(const std::string& ctx_onnx_model_path, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index fa859ce81be98..dd56731ac9f7f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -570,8 +570,7 @@ Status QnnBackendManager::SetHtpPowerConfig() { "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = 
htp_infra->perfInfra; // Get power client id - uint32_t powerconfig_client_id = 0; - status = htp_perf_infra.createPowerConfigId(/*device_id=*/0, /*core_id=*/0, &powerconfig_client_id); + status = htp_perf_infra.createPowerConfigId(/*device_id=*/0, /*core_id=*/0, &htp_power_config_client_id_); ORT_RETURN_IF(QNN_SUCCESS != status, "createPowerConfigId failed."); constexpr const int kNumConfigs = 1; @@ -580,7 +579,7 @@ Status QnnBackendManager::SetHtpPowerConfig() { QnnHtpPerfInfrastructure_PowerConfig_t& dcvs_config = power_configs[0]; dcvs_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; QnnHtpPerfInfrastructure_DcvsV3_t& dcvs_v3 = dcvs_config.dcvsV3Config; - dcvs_v3.contextId = powerconfig_client_id; + dcvs_v3.contextId = htp_power_config_client_id_; dcvs_v3.setSleepDisable = 0; dcvs_v3.sleepDisable = 0; dcvs_v3.setDcvsEnable = 1; @@ -678,7 +677,7 @@ Status QnnBackendManager::SetHtpPowerConfig() { break; } std::vector perf_power_configs_ptr_ = ObtainNullTermPtrVector(power_configs); - status = htp_perf_infra.setPowerConfig(powerconfig_client_id, perf_power_configs_ptr_.data()); + status = htp_perf_infra.setPowerConfig(htp_power_config_client_id_, perf_power_configs_ptr_.data()); ORT_RETURN_IF(QNN_SUCCESS != status, "setPowerConfig failed for HTP performance mode."); // Set rpc control latency here, but note that v68 doesn't support rpc polling mode. @@ -692,7 +691,7 @@ Status QnnBackendManager::SetHtpPowerConfig() { rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; rpc_control_latency.rpcControlLatencyConfig = rpc_control_latency_; perf_power_configs_ptr_ = ObtainNullTermPtrVector(rpc_power_configs); - status = htp_perf_infra.setPowerConfig(powerconfig_client_id, perf_power_configs_ptr_.data()); + status = htp_perf_infra.setPowerConfig(htp_power_config_client_id_, perf_power_configs_ptr_.data()); ORT_RETURN_IF(QNN_SUCCESS != status, "setPowerConfig failed for RPC control latency."); } @@ -713,12 +712,36 @@ void QnnBackendManager::Split(std::vector& split_string, } } +Status QnnBackendManager::DestroyHTPPowerConfigID() { + if (htp_performance_mode_ == HtpPerformanceMode::kHtpDefault) { + return Status::OK(); + } + + QnnDevice_Infrastructure_t qnn_device_infra = nullptr; + auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); + ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); + + auto* htp_infra = static_cast(qnn_device_infra); + ORT_RETURN_IF(QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF != htp_infra->infraType, + "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); + QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; + + Qnn_ErrorHandle_t destroy_ret = htp_perf_infra.destroyPowerConfigId(htp_power_config_client_id_); + ORT_RETURN_IF(QNN_SUCCESS != destroy_ret, "destroyPowerConfigId failed."); + return Status::OK(); +} + void QnnBackendManager::ReleaseResources() { if (!backend_setup_completed_) { return; } - auto result = ReleaseContext(); + auto result = DestroyHTPPowerConfigID(); + if (Status::OK() != result) { + ORT_THROW("Failed to DestroyHTPPowerConfigID."); + } + + result = ReleaseContext(); if (Status::OK() != result) { ORT_THROW("Failed to ReleaseContext."); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 9cb6a322149b9..de5ccb5a28389 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ 
b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -133,6 +133,8 @@ class QnnBackendManager { Status UnloadLib(void* handle); + Status DestroyHTPPowerConfigID(); + void* LibFunction(void* handle, const char* symbol, std::string& error_msg); template @@ -201,6 +203,7 @@ class QnnBackendManager { std::set mod_handles_; #endif const std::string qnn_saver_path_; + uint32_t htp_power_config_client_id_ = 0; }; } // namespace qnn diff --git a/onnxruntime/core/providers/qnn/builder/qnn_def.h b/onnxruntime/core/providers/qnn/builder/qnn_def.h index 8649db92be027..6080c63b555a8 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_def.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_def.h @@ -48,6 +48,14 @@ enum class HtpPerformanceMode : uint8_t { kHtpBalanced, }; +// Defines the graph optimization strategy used by the HTP backend. +enum class HtpGraphFinalizationOptimizationMode : uint8_t { + kDefault = 0, + kMode1 = 1, // Faster preparation time, less optimal graph + kMode2 = 2, // Longer preparation time, more optimal graph + kMode3 = 3, // Longest preparation time, most likely even more optimal graph. +}; + enum class QnnBackendType : uint8_t { CPU = 0, GPU, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc b/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc new file mode 100644 index 0000000000000..63aa01b48e7e2 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/qnn/builder/qnn_graph_configs_helper.h" + +#include "HTP/QnnHtpGraph.h" + +namespace onnxruntime { +namespace qnn { + +const QnnGraph_Config_t** QnnGraphConfigsBuilder::GetQnnGraphConfigs() { + if (graph_config_ptrs_.empty()) { + return nullptr; + } + + if (!IsNullTerminated()) { + graph_config_ptrs_.push_back(nullptr); + } + + return graph_config_ptrs_.data(); +} + +QnnHtpGraph_CustomConfig_t& QnnGraphConfigsBuilder::PushHtpGraphCustomConfig() { + htp_custom_graph_configs_.push_back(QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT); + return htp_custom_graph_configs_.back(); +} + +QnnGraph_Config_t& QnnGraphConfigsBuilder::PushGraphConfig() { + graph_configs_.push_back(QNN_GRAPH_CONFIG_INIT); + QnnGraph_Config_t& config = graph_configs_.back(); + + // Add pointer to this new graph config to the list of graph config pointers. + if (IsNullTerminated()) { + graph_config_ptrs_.back() = &config; // Replace last nullptr entry. + } else { + graph_config_ptrs_.push_back(&config); + } + + return config; +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h b/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h new file mode 100644 index 0000000000000..8c4928fdacbc4 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h @@ -0,0 +1,56 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#include "HTP/QnnHtpGraph.h" + +namespace onnxruntime { +namespace qnn { + +/** + * Helper class for building a null-terminated list of QNN Graph configurations. + * A QNN configuration consists of multiple objects with references to each other. This + * class ensures that all configuration objects have the same lifetime, so that they remain valid + * across the call to graphCreate(). 
+ */ +class QnnGraphConfigsBuilder { + public: + /** + * Returns a pointer to the beginning of a null-terminated array of QNN Graph configurations. + * This result is passed QNN's graphCreate() API. + * + * \return Pointer to null-terminated QnnGraph_Config_t* array. + */ + const QnnGraph_Config_t** GetQnnGraphConfigs(); + + /** + * Creates and returns a reference to a new HTP graph configuration object. The object is initialized to + * the QNN recommended default value. The caller is meant to override fields in this object. + * + * \return A reference to a default QnnHtpGraph_CustomConfig_t object. + */ + QnnHtpGraph_CustomConfig_t& PushHtpGraphCustomConfig(); + + /** + * Creates and returns a reference to a new graph configuration object. The object is initialized to + * the QNN recommended default value. The caller is meant to override fields in this object. + * + * \return A reference to a default QnnGraph_Config_t object. + */ + QnnGraph_Config_t& PushGraphConfig(); + + private: + bool IsNullTerminated() const { + return !graph_config_ptrs_.empty() && graph_config_ptrs_.back() == nullptr; + } + + InlinedVector htp_custom_graph_configs_; + InlinedVector graph_configs_; + InlinedVector graph_config_ptrs_; +}; + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 0a458f2602b89..fd3a95b5f1f78 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -87,7 +87,8 @@ const NodeUnit& QnnModel::GetNodeUnit(const Node* node, } Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer, - const onnxruntime::Node& fused_node) { + const onnxruntime::Node& fused_node, + const QnnGraph_Config_t** graph_configs) { LOGS(logger_, VERBOSE) << "ComposeGraph Graph name: " << graph_viewer.Name(); // Holder for the NodeUnits in the graph, this will guarantee the NodeUnits is @@ -107,7 +108,7 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer, initializer_inputs_, qnn_backend_manager_->GetQnnBackendType()); bool rt = true; - rt = qnn_model_wrapper.CreateQnnGraph(qnn_backend_manager_->GetQnnContext(), graph_name); + rt = qnn_model_wrapper.CreateQnnGraph(qnn_backend_manager_->GetQnnContext(), graph_name, graph_configs); if (!rt) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to initialize qnn_model_wrapper."); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.h b/onnxruntime/core/providers/qnn/builder/qnn_model.h index 373995106f31c..de4f872f73ccf 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.h @@ -27,7 +27,8 @@ class QnnModel { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QnnModel); Status ComposeGraph(const GraphViewer& graph_viewer, - const onnxruntime::Node& fused_node); + const onnxruntime::Node& fused_node, + const QnnGraph_Config_t** graph_configs = nullptr); Status FinalizeGraphs(); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index d3aafcbecd322..6cb276378a09c 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -76,6 +76,24 @@ void QNNExecutionProvider::ParseHtpPerformanceMode(std::string htp_performance_m } } +void QNNExecutionProvider::ParseHtpGraphFinalizationOptimizationMode(const std::string& htp_graph_finalization_opt_mode_string) { + 
LOGS_DEFAULT(VERBOSE) << "HTP graph finalization optimization mode: " + << htp_graph_finalization_opt_mode_string; + + if (htp_graph_finalization_opt_mode_string.empty() || htp_graph_finalization_opt_mode_string == "0") { + htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; + } else if (htp_graph_finalization_opt_mode_string == "1") { + htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kMode1; + } else if (htp_graph_finalization_opt_mode_string == "2") { + htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kMode2; + } else if (htp_graph_finalization_opt_mode_string == "3") { + htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kMode3; + } else { + LOGS_DEFAULT(WARNING) << "Invalid HTP graph finalization optimization mode: " + << htp_graph_finalization_opt_mode_string; + } +} + QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options) : IExecutionProvider{onnxruntime::kQnnExecutionProvider, true}, @@ -140,6 +158,13 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio ParseHtpPerformanceMode(htp_performance_mode_pos->second); } + htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; + static const std::string HTP_GRAPH_FINALIZATION_OPT_MODE = "htp_graph_finalization_optimization_mode"; + auto htp_graph_finalization_opt_mode_pos = runtime_options_.find(HTP_GRAPH_FINALIZATION_OPT_MODE); + if (htp_graph_finalization_opt_mode_pos != runtime_options_.end()) { + ParseHtpGraphFinalizationOptimizationMode(htp_graph_finalization_opt_mode_pos->second); + } + // Enable use of QNN Saver if the user provides a path the QNN Saver backend library. 
static const std::string QNN_SAVER_PATH_KEY = "qnn_saver_path"; std::string qnn_saver_path; @@ -448,6 +473,20 @@ Status QNNExecutionProvider::CreateComputeFunc(std::vector& nod return Status::OK(); } +void QNNExecutionProvider::InitQnnGraphConfigs(qnn::QnnGraphConfigsBuilder& configs_builder) const { + if (qnn_backend_manager_->GetQnnBackendType() == qnn::QnnBackendType::HTP && + htp_graph_finalization_opt_mode_ != qnn::HtpGraphFinalizationOptimizationMode::kDefault) { + QnnHtpGraph_CustomConfig_t& htp_graph_opt_config = configs_builder.PushHtpGraphCustomConfig(); + htp_graph_opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + htp_graph_opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + htp_graph_opt_config.optimizationOption.floatValue = static_cast(htp_graph_finalization_opt_mode_); + + QnnGraph_Config_t& graph_opt_config = configs_builder.PushGraphConfig(); + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &htp_graph_opt_config; + } +} + Status QNNExecutionProvider::CompileFromOrtGraph(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs, const logging::Logger& logger) { @@ -458,7 +497,10 @@ Status QNNExecutionProvider::CompileFromOrtGraph(const std::vector qnn_model = std::make_unique(logger, qnn_backend_manager_.get()); - ORT_RETURN_IF_ERROR(qnn_model->ComposeGraph(graph_viewer, fused_node)); + qnn::QnnGraphConfigsBuilder graph_configs_builder; + InitQnnGraphConfigs(graph_configs_builder); + + ORT_RETURN_IF_ERROR(qnn_model->ComposeGraph(graph_viewer, fused_node, graph_configs_builder.GetQnnGraphConfigs())); ORT_RETURN_IF_ERROR(qnn_model->FinalizeGraphs()); ORT_RETURN_IF_ERROR(qnn_model->SetupQnnInputOutput()); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index c63a60018aca8..a01b828531555 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -9,6 +9,7 @@ #include "core/providers/qnn/builder/qnn_backend_manager.h" #include "core/providers/qnn/builder/qnn_model.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" +#include "core/providers/qnn/builder/qnn_graph_configs_helper.h" namespace onnxruntime { @@ -57,10 +58,15 @@ class QNNExecutionProvider : public IExecutionProvider { void ParseHtpPerformanceMode(std::string htp_performance_mode_string); + void ParseHtpGraphFinalizationOptimizationMode(const std::string& htp_graph_finalization_opt_mode_string); + + void InitQnnGraphConfigs(qnn::QnnGraphConfigsBuilder& configs_holder) const; + private: ProviderOptions runtime_options_; qnn::ProfilingLevel profiling_level_ = qnn::ProfilingLevel::OFF; qnn::HtpPerformanceMode htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; + qnn::HtpGraphFinalizationOptimizationMode htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; std::unique_ptr qnn_backend_manager_; std::unordered_map> qnn_models_; uint32_t rpc_control_latency_ = 0; diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 0526ccca5bb4e..98646058eec3d 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -59,6 +59,8 @@ void usage() { "\t [QNN only] [qnn_context_embed_mode]: 1 means dump the QNN context binary into the Onnx skeleton model.\n" "\t 0 means dump the QNN context binary into separate bin file and set the path in the Onnx skeleton model.\n" 
"\t [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n" + "\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n" + "\t '0', '1', '2', '3', default is '0'.\n" "\t [Usage]: -e -i '| |' \n\n" "\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_path|/folderpath/libQnnCpu.so\" \n\n" "\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n" @@ -488,9 +490,19 @@ int real_main(int argc, char* argv[], Ort::Env& env) { } } else if (key == "qnn_saver_path") { // no validation + } else if (key == "htp_graph_finalization_optimization_mode") { + std::unordered_set supported_htp_graph_final_opt_modes = {"0", "1", "2", "3"}; + if (supported_htp_graph_final_opt_modes.find(value) == supported_htp_graph_final_opt_modes.end()) { + std::ostringstream str_stream; + std::copy(supported_htp_graph_final_opt_modes.begin(), supported_htp_graph_final_opt_modes.end(), + std::ostream_iterator(str_stream, ",")); + std::string str = str_stream.str(); + ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str); + } } else { ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable', -'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode'])"); +'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode', 'qnn_saver_path', +'htp_graph_finalization_optimization_mode'])"); } qnn_options[key] = value; diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 6d075fec997b5..72472e5798792 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -71,6 +71,9 @@ namespace perftest { "\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n" "\t [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n" "\t 'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n" + "\t [QNN only] [qnn_saver_path]: QNN Saver backend path. 
e.g '/folderpath/libQnnSaver.so'.\n" + "\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n" + "\t '0', '1', '2', '3', default is '0'.\n" "\t [Usage]: -e -i '| |'\n\n" "\t [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU_FP32 enable_npu_fast_compile|true num_of_threads|5 enable_opencl_throttling|true cache_dir|\"\"\"\n" "\t [Example] [For QNN EP] -e qnn -i \"backend_path|/folderpath/libQnnCpu.so\" \n\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index b7a111783fc94..f3ea188043dbe 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -356,9 +356,21 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device std::string str = str_stream.str(); ORT_THROW("Supported htp_performance_mode: " + str); } + } else if (key == "qnn_saver_path") { + // no validation + } else if (key == "htp_graph_finalization_optimization_mode") { + std::unordered_set supported_htp_graph_final_opt_modes = {"0", "1", "2", "3"}; + if (supported_htp_graph_final_opt_modes.find(value) == supported_htp_graph_final_opt_modes.end()) { + std::ostringstream str_stream; + std::copy(supported_htp_graph_final_opt_modes.begin(), supported_htp_graph_final_opt_modes.end(), + std::ostream_iterator(str_stream, ",")); + std::string str = str_stream.str(); + ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str); + } } else { ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable', -'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode'])"); +'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode', 'qnn_saver_path', +'htp_graph_finalization_optimization_mode'])"); } qnn_options[key] = value; diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 5f63813d8d84e..02ff834169b2b 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -173,7 +173,8 @@ TEST(QnnEP, TestDisableCPUFallback_ConflictingConfig) { // The models passed to this function are subgraphs extracted from a larger model that exhibited // shape inferencing issues on QNN. Thus, the models are expected to have a specific input/output // types and shapes. -static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bool enable_qnn_saver = false) { +static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bool enable_qnn_saver = false, + std::string htp_graph_finalization_opt_mode = "") { Ort::SessionOptions so; // Ensure all type/shape inference warnings result in errors! @@ -194,6 +195,10 @@ static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bo } #endif + if (!htp_graph_finalization_opt_mode.empty()) { + options["htp_graph_finalization_optimization_mode"] = std::move(htp_graph_finalization_opt_mode); + } + so.AppendExecutionProvider("QNN", options); Ort::Session session(*ort_env, ort_model_path, so); @@ -302,6 +307,21 @@ TEST_F(QnnHTPBackendTests, QnnSaver_OutputFiles) { EXPECT_TRUE(std::filesystem::exists(qnn_saver_output_dir / "params.bin")); } +// Test that models run with various HTP graph finalization optimization modes. 
+TEST_F(QnnHTPBackendTests, HTPGraphFinalizationOptimizationModes) { + constexpr std::array graph_opt_modes = {"", // No explicit mode specified + "0", // Explicit default mode + "1", // Mode 1 + "2", // Mode 2 + "3"}; // Mode 3 + for (auto mode : graph_opt_modes) { + RunNHWCResizeModel(ORT_MODEL_FOLDER "nhwc_resize_sizes_opset18.quant.onnx", + true, // use_htp + false, // enable_qnn_saver + mode); // htp_graph_finalization_opt_mode + } +} + #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) #endif // !defined(ORT_MINIMAL_BUILD) From a37e6a503b3073605b1eb765d1b0ff5a61360991 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 8 Nov 2023 11:19:45 -0800 Subject: [PATCH 06/13] Update Abseil raw_flat_hash visualization (#18329) ### Description Fix the broken pieces due to the latest Abseil update. ### Motivation and Context - empty - {{ size={size_} }} + + + + + + empty + size={ _size() } + size=({_size()}) - size_ - capacity_ - + _size() + _capacity() + - size_ + _size() - - slots_[nslot] + + _slots()[nslot] nslot++ - + + + + + *($T1 *){value} + (*($T1 *){value}) + + *($T1 *){value} + + + + + + *($T1 *)this + (*($T1 *)this) + + *($T1 *)this + + + - {{ {value.first}:{value.second} }} + {value.first}, {value.second} + ({value.first}, {value.second}) - value.first - value.second + value.first + value.second From 68fab24c22e1453ebfcd9d67e857c6531985faa0 Mon Sep 17 00:00:00 2001 From: sophies927 <107952697+sophies927@users.noreply.github.com> Date: Wed, 8 Nov 2023 11:56:35 -0800 Subject: [PATCH 07/13] Update stale.yml (#18304) Exempt all issues w/ assignees from stale bot, increase days before issue close, + add start date to address issue w/ GH API rate limiting ### Description ### Motivation and Context --- .github/workflows/stale.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 67d8550d44204..e49e0921af434 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -17,10 +17,14 @@ jobs: with: # Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale exempt-issue-labels: contributions welcome, feature request, regression + # Override exempt-all-assignees but only to exempt the issues with an assignee to be marked as stale automatically + exempt-all-issue-assignees: true + # Used to ignore the issues and pull requests created before the start date + start-date: 20220419 # Number of days without activity before the actions/stale action labels an issue days-before-issue-stale: 30 # Number of days without activity before the actions/stale action closes an issue - days-before-issue-close: 7 + days-before-issue-close: 30 # Label you want to apply to issues that have been inactive for the amount of time specified by days-before-issue-stale stale-issue-label: "stale" # Comment that you want to add to issues that are labeled by the actions/stale action From 812532592e070823f0362af8c3219ca6b01eda77 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 8 Nov 2023 13:03:18 -0800 Subject: [PATCH 08/13] Add a build validation for Linux ARM64 cross-compile (#18200) ### Description 1. Add a build validation for Linux ARM64/ARM32 cross-compile to catch issues listed in #18195 . 2. Revert eigen's commit id back to what we had before. ### Motivation and Context To catch cross-compile issues. 
Added a TODO item for fixing the compile warnings in Linux ARM32 build: AB#21639 --- cgmanifests/cgmanifest.json | 2 +- cgmanifests/generated/cgmanifest.json | 4 +- cmake/deps.txt | 9 ++- .../linux_arm32_crosscompile_toolchain.cmake | 9 +++ .../linux_arm64_crosscompile_toolchain.cmake | 9 +++ onnxruntime/core/common/threadpool.cc | 12 ++-- onnxruntime/core/platform/windows/env.cc | 2 +- .../azure-pipelines/linux-ci-pipeline.yml | 58 ++++++++++++++++++- .../templates/download-deps.yml | 4 +- tools/ci_build/replace_urls_in_deps.py | 12 ++++ 10 files changed, 104 insertions(+), 17 deletions(-) create mode 100644 cmake/linux_arm32_crosscompile_toolchain.cmake create mode 100644 cmake/linux_arm64_crosscompile_toolchain.cmake diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json index 2a3de3bb0ee51..e8dbc9cf9eff6 100644 --- a/cgmanifests/cgmanifest.json +++ b/cgmanifests/cgmanifest.json @@ -568,7 +568,7 @@ "component": { "type": "git", "git": { - "commitHash": "d10b27fe37736d2944630ecd7557cefa95cf87c9", + "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac", "repositoryUrl": "https://gitlab.com/libeigen/eigen.git" } } diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 6b0e3659bd234..0886a29fa573e 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -136,7 +136,7 @@ "component": { "type": "git", "git": { - "commitHash": "003c580e696a774afdc984996ee909b7c8d8128c", + "commitHash": "0da379fc4808f9601faef392352018c741c0f297", "repositoryUrl": "https://github.com/google/XNNPACK.git" }, "comments": "googlexnnpack" @@ -226,7 +226,7 @@ "component": { "type": "git", "git": { - "commitHash": "1787867f6183f056420e532eec640cba25efafea", + "commitHash": "4fe0e1e183925bf8cfa6aae24237e724a96479b8", "repositoryUrl": "https://github.com/Maratyszcza/pthreadpool.git" }, "comments": "pthreadpool" diff --git a/cmake/deps.txt b/cmake/deps.txt index 4aab8c974d62f..275b5eaf6b976 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -9,14 +9,19 @@ #since the file contains a version string: "lts_20230802". However, the file is for debugging purposes only and would #not affect built binaries. # -# NOTE: You must run deps_update_and_upload.py when ready to test your changes in a CI. +# NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI. # See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29 # abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.0.zip;04271dfbfac59269b6939e1e9d5faf0d18a7ba91 cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0 date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159 dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b31321e5549591d78aa7f377173445 -eigen;https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip;ef24286b7ece8737c99fa831b02941843546c081 +# This Eigen commit id matches the eigen archive being consumed from https://gitlab.com/libeigen/eigen/-/archive/3.4/eigen-3.4.zip +# prior to the 3.4.1 RC changing the bits and invalidating the hash. 
+# it contains changes on top of 3.4.0 which are required to fix build issues. +# Until the 3.4.1 release this is the best option we have. +# Issue link: https://gitlab.com/libeigen/eigen/-/issues/2744 +eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;be8be39fdbc6e60e94fa7870b280707069b5b81a flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v1.12.0.zip;ba0a75fd12dbef8f6557a74e611b7a3d0c5fe7bf fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494 fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1 diff --git a/cmake/linux_arm32_crosscompile_toolchain.cmake b/cmake/linux_arm32_crosscompile_toolchain.cmake new file mode 100644 index 0000000000000..0183262a8875e --- /dev/null +++ b/cmake/linux_arm32_crosscompile_toolchain.cmake @@ -0,0 +1,9 @@ + #This file is just a sample. You may need to modify it before using. + SET(CMAKE_SYSTEM_NAME Linux) + SET(CMAKE_SYSTEM_VERSION 1) + SET(CMAKE_C_COMPILER arm-none-linux-gnueabihf-gcc) + SET(CMAKE_CXX_COMPILER arm-none-linux-gnueabihf-g++) + SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) + SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) + SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + SET(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) \ No newline at end of file diff --git a/cmake/linux_arm64_crosscompile_toolchain.cmake b/cmake/linux_arm64_crosscompile_toolchain.cmake new file mode 100644 index 0000000000000..1a492bbc269e7 --- /dev/null +++ b/cmake/linux_arm64_crosscompile_toolchain.cmake @@ -0,0 +1,9 @@ + #This file is just a sample. You may need to modify it before using. 
+ SET(CMAKE_SYSTEM_NAME Linux) + SET(CMAKE_SYSTEM_VERSION 1) + SET(CMAKE_C_COMPILER aarch64-none-linux-gnu-gcc) + SET(CMAKE_CXX_COMPILER aarch64-none-linux-gnu-g++) + SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) + SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) + SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + SET(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) \ No newline at end of file diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc index f29ab19608934..10e117267e14b 100644 --- a/onnxruntime/core/common/threadpool.cc +++ b/onnxruntime/core/common/threadpool.cc @@ -562,7 +562,7 @@ static ptrdiff_t CalculateParallelForBlock(const ptrdiff_t n, const Eigen::Tenso constexpr ptrdiff_t max_oversharding_factor = 4; ptrdiff_t block_size = Eigen::numext::mini( n, - Eigen::numext::maxi(Eigen::divup(n, max_oversharding_factor * num_threads), static_cast(block_size_f))); + Eigen::numext::maxi(Eigen::numext::div_ceil(n, max_oversharding_factor * num_threads), static_cast(block_size_f))); const ptrdiff_t max_block_size = Eigen::numext::mini(n, 2 * block_size); if (block_align) { @@ -571,19 +571,19 @@ static ptrdiff_t CalculateParallelForBlock(const ptrdiff_t n, const Eigen::Tenso block_size = Eigen::numext::mini(n, new_block_size); } - ptrdiff_t block_count = Eigen::divup(n, block_size); + ptrdiff_t block_count = Eigen::numext::div_ceil(n, block_size); // Calculate parallel efficiency as fraction of total CPU time used for // computations: double max_efficiency = - static_cast(block_count) / (Eigen::divup(block_count, num_threads) * num_threads); + static_cast(block_count) / (Eigen::numext::div_ceil(block_count, num_threads) * num_threads); // Now try to increase block size up to max_block_size as long as it // doesn't decrease parallel efficiency. for (ptrdiff_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) { // This is the next block size that divides size into a smaller number // of blocks than the current block_size. - ptrdiff_t coarser_block_size = Eigen::divup(n, prev_block_count - 1); + ptrdiff_t coarser_block_size = Eigen::numext::div_ceil(n, prev_block_count - 1); if (block_align) { ptrdiff_t new_block_size = block_align(coarser_block_size); assert(new_block_size >= coarser_block_size); @@ -593,11 +593,11 @@ static ptrdiff_t CalculateParallelForBlock(const ptrdiff_t n, const Eigen::Tenso break; // Reached max block size. Stop. } // Recalculate parallel efficiency. - const ptrdiff_t coarser_block_count = Eigen::divup(n, coarser_block_size); + const ptrdiff_t coarser_block_count = Eigen::numext::div_ceil(n, coarser_block_size); assert(coarser_block_count < prev_block_count); prev_block_count = coarser_block_count; const double coarser_efficiency = - static_cast(coarser_block_count) / (Eigen::divup(coarser_block_count, num_threads) * num_threads); + static_cast(coarser_block_count) / (Eigen::numext::div_ceil(coarser_block_count, num_threads) * num_threads); if (coarser_efficiency + 0.01 >= max_efficiency) { // Taking it. block_size = coarser_block_size; diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index f02c61daabeed..45648010baf86 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -32,7 +32,7 @@ limitations under the License. 
#include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" -#include "unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h" +#include #include #include "core/platform/path_lib.h" // for LoopDir() diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index 395c190ce9e11..f46febee178e1 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -56,10 +56,23 @@ stages: clean: true submodules: none - - task: UsePythonVersion@0 + - task: DownloadPackage@1 + displayName: 'Download ARM64 GCC' inputs: - versionSpec: '3.8' - addToPath: true + packageType: upack + feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' + definition: 'gcc_aarch64_linux_gnu_host_x86_64' + version: 13.2.1 + downloadPath: $(Build.BinariesDirectory)/gcc + + - task: DownloadPackage@1 + displayName: 'Download ARM32 GCC' + inputs: + packageType: upack + feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' + definition: 'gcc_aarch32_linux_gnu_host_x86_64' + version: 13.2.1 + downloadPath: $(Build.BinariesDirectory)/gcc - template: templates/get-docker-image-steps.yml parameters: @@ -68,6 +81,45 @@ stages: DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=registry.access.redhat.com/ubi8/ubi" Repository: onnxruntimecpubuild + - task: PythonScript@0 + displayName: 'Update deps.txt' + inputs: + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py + arguments: --new_dir $(Build.BinariesDirectory)/deps + workingDirectory: $(Build.BinariesDirectory) + pythonInterpreter: /usr/bin/python3 + + - script: | + set -e -x + # ARM64 build + mkdir -p $(Build.BinariesDirectory)/gccbin + tar -Jxf $(Build.BinariesDirectory)/gcc/arm-gnu-toolchain-13.2.rel1-x86_64-aarch64-none-linux-gnu.tar.xz --strip=1 -C $(Build.BinariesDirectory)/gccbin + export PATH=$(Build.BinariesDirectory)/gccbin/bin:$PATH + mkdir $(Build.BinariesDirectory)/aarch64build + cd $(Build.BinariesDirectory)/aarch64build + cmake $(Build.SourcesDirectory)/cmake -Donnxruntime_ENABLE_CPUINFO=OFF -DPython_EXECUTABLE=/usr/bin/python3 -DPYTHON_EXECUTABLE=/usr/bin/python3 -DCMAKE_BUILD_TYPE=Debug -DCMAKE_TOOLCHAIN_FILE=$(Build.SourcesDirectory)/cmake/linux_arm64_crosscompile_toolchain.cmake -G Ninja + ninja + rm -rf $(Build.BinariesDirectory)/aarch64build $(Build.BinariesDirectory)/gccbin + # ARM32 build + mkdir -p $(Build.BinariesDirectory)/gccbin + tar -Jxf $(Build.BinariesDirectory)/gcc/arm-gnu-toolchain-13.2.rel1-x86_64-arm-none-linux-gnueabihf.tar.xz --strip=1 -C $(Build.BinariesDirectory)/gccbin + ls $(Build.BinariesDirectory)/gccbin/bin + mkdir $(Build.BinariesDirectory)/arm32build + cd $(Build.BinariesDirectory)/arm32build + # TODO: fix the warnings and remove the --compile-no-warning-as-error arg + cmake --compile-no-warning-as-error $(Build.SourcesDirectory)/cmake -Donnxruntime_ENABLE_CPUINFO=OFF -DPython_EXECUTABLE=/usr/bin/python3 -DPYTHON_EXECUTABLE=/usr/bin/python3 -DCMAKE_BUILD_TYPE=Debug -DCMAKE_TOOLCHAIN_FILE=$(Build.SourcesDirectory)/cmake/linux_arm32_crosscompile_toolchain.cmake -G Ninja + ninja + rm -rf $(Build.BinariesDirectory)/arm32build $(Build.BinariesDirectory)/gccbin + displayName: Cross-compile for Linux ARM32 and ARM64 + + - task: PythonScript@0 + displayName: 'Update deps.txt' + inputs: + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py + arguments: --new_dir /build/deps + 
workingDirectory: $(Build.BinariesDirectory) + pythonInterpreter: /usr/bin/python3 + - template: templates/linux-build-step-with-cache.yml parameters: WithCache: true diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index bb9f1fd277e2d..a78f743c15347 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.110 + version: 1.0.114 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.110 + version: 1.0.114 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/replace_urls_in_deps.py b/tools/ci_build/replace_urls_in_deps.py index 28e3c91107c6c..ac4f515d5482b 100644 --- a/tools/ci_build/replace_urls_in_deps.py +++ b/tools/ci_build/replace_urls_in_deps.py @@ -8,6 +8,7 @@ import argparse import csv import os +import shutil from dataclasses import dataclass from pathlib import Path @@ -46,7 +47,16 @@ def main(): deps = [] csv_file_path = Path(REPO_DIR) / "cmake" / "deps.txt" + backup_csv_file_path = Path(REPO_DIR) / "cmake" / "deps.txt.bak" + # prefer to use the backup file + if backup_csv_file_path.exists(): + csv_file_path = backup_csv_file_path + else: + # Make a copy before modifying it + print("Making a copy to %s" % str(backup_csv_file_path)) + shutil.copy(csv_file_path, backup_csv_file_path) + print("Reading from %s" % str(csv_file_path)) # Read the whole file into memory first with csv_file_path.open("r", encoding="utf-8") as f: depfile_reader = csv.reader(f, delimiter=";") @@ -58,6 +68,8 @@ def main(): continue deps.append(Dep(row[0], row[1], row[2])) + csv_file_path = Path(REPO_DIR) / "cmake" / "deps.txt" + print("Writing to %s" % str(csv_file_path)) # Write updated content back with csv_file_path.open("w", newline="", encoding="utf-8") as f: depfile_writer = csv.writer(f, delimiter=";") From c250540722a8028250de1d572365ec73ef6ecedf Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Wed, 8 Nov 2023 13:04:40 -0800 Subject: [PATCH 09/13] Bump linter versions (#18341) Bump linter versions and run format. 
--- .../core/platform/EigenNonBlockingThreadPool.h | 2 +- onnxruntime/python/backend/backend.py | 4 ++-- .../models/stable_diffusion/diffusion_models.py | 2 +- .../providers/cpu/math/element_wise_ops_test.cc | 4 ++-- .../providers/cpu/reduction/reduction_ops_test.cc | 4 ++-- .../test/providers/nnapi/nnapi_basic_test.cc | 9 +++++---- .../python/training/ortmodule/_fallback.py | 4 ++-- requirements-lintrunner.txt | 8 ++++---- tools/ci_build/build.py | 10 ++++------ ...nd_optimizer_opset_version_updates_required.py | 2 +- winml/lib/Api.Image/inc/ImageConversionHelpers.h | 3 ++- .../Api.Image/inc/TensorToVideoFrameConverter.h | 5 +---- .../Api.Image/inc/VideoFrameToTensorConverter.h | 5 +---- winml/lib/Api.Ort/OnnxruntimeModel.cpp | 6 ++++-- winml/lib/Api/impl/TensorBase.h | 3 ++- winml/test/adapter/AdapterDmlEpTest.cpp | 6 ++++-- winml/test/adapter/AdapterSessionTest.cpp | 3 ++- winml/test/api/LearningModelAPITest.cpp | 9 ++++++--- winml/test/api/LearningModelBindingAPITest.cpp | 3 ++- winml/test/api/LearningModelSessionAPITest.cpp | 9 ++++++--- winml/test/api/RawApiHelpers.cpp | 3 ++- winml/test/api/RawApiTestsGpu.cpp | 3 ++- winml/test/concurrency/ConcurrencyTests.cpp | 12 ++++++++---- winml/test/image/imageTestHelper.cpp | 3 ++- winml/test/image/imagetests.cpp | 3 ++- winml/test/model/model_tests.cpp | 3 ++- winml/test/model/skip_model_tests.h | 10 ++++------ winml/test/scenario/cppwinrt/CustomNullOp.h | 3 ++- winml/test/scenario/cppwinrt/CustomOps.cpp | 15 ++++++++------- winml/test/scenario/cppwinrt/NoisyReluCpu.h | 12 ++++++++---- winml/test/scenario/cppwinrt/ReluCpu.h | 3 ++- .../scenario/cppwinrt/scenariotestscppwinrt.cpp | 9 ++++++--- 32 files changed, 102 insertions(+), 78 deletions(-) diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h index a57385f6e23f1..f9b694efb936f 100644 --- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h +++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h @@ -278,7 +278,7 @@ class ThreadPoolProfiler { int num_threads_; #ifdef _MSC_VER #pragma warning(push) -// C4324: structure was padded due to alignment specifier + // C4324: structure was padded due to alignment specifier #pragma warning(disable : 4324) #endif // _MSC_VER struct ORT_ALIGN_TO_AVOID_FALSE_SHARING ChildThreadStat { diff --git a/onnxruntime/python/backend/backend.py b/onnxruntime/python/backend/backend.py index 1edae383e93e6..97b7358f2a223 100644 --- a/onnxruntime/python/backend/backend.py +++ b/onnxruntime/python/backend/backend.py @@ -63,7 +63,7 @@ def is_opset_supported(cls, model): error_message = ( "Skipping this test as only released onnx opsets are supported." "To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0." - " Got Domain '{}' version '{}'.".format(domain, opset.version) + f" Got Domain '{domain}' version '{opset.version}'." ) return False, error_message except AttributeError: @@ -74,7 +74,7 @@ def is_opset_supported(cls, model): error_message = ( "Skipping this test as only released onnx opsets are supported." "To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0." - " Got Domain '{}' version '{}'.".format(domain, opset.version) + f" Got Domain '{domain}' version '{opset.version}'." 
) return False, error_message return True, "" diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py index d93ca8dba7fa0..3dcde7f6db398 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py @@ -463,7 +463,7 @@ def add_hidden_states_graph_output(self, model: ModelProto, optimized_onnx_path, assert self.clip_skip >= 0 and self.clip_skip < hidden_layers - node_output_name = "/text_model/encoder/layers.{}/Add_1_output_0".format(hidden_layers - 1 - self.clip_skip) + node_output_name = f"/text_model/encoder/layers.{hidden_layers - 1 - self.clip_skip}/Add_1_output_0" # search the name in outputs of all node found = False diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index 257ce977700a6..5e746ed0c62d4 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -1238,7 +1238,7 @@ TEST(MathOpTest, Sum_8_Test1) { // This test runs fine on CPU Plugin test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Expected output shape [{3,3,3}] did not match run output shape [{3,1,1}] for sum + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Expected output shape [{3,3,3}] did not match run output shape [{3,1,1}] for sum #endif } @@ -1264,7 +1264,7 @@ TEST(MathOpTest, Sum_8_Test1_double) { // This test runs fine on CPU Plugin test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Expected output shape [{3,3,3}] did not match run output shape [{3,1,1}] for sum + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Expected output shape [{3,3,3}] did not match run output shape [{3,1,1}] for sum #endif } TEST(MathOpTest, Sum_8_Test2) { diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index c9b851e450f9d..79da8004a9edd 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -1086,7 +1086,7 @@ TEST(ReductionOpTest, ReduceMax_int32) { #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: axis must be 0 #endif } @@ -1107,7 +1107,7 @@ TEST(ReductionOpTest, ReduceMax_int64) { #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: axis must be 0 #endif } diff --git a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc index 0e783a94c5479..b3e1025e7367c 100644 --- a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc +++ b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc @@ -556,10 +556,11 @@ TEST(NnapiExecutionProviderTest, ActivationOutsideOfPartition) { constexpr auto* model_file_name = ORT_TSTR("testdata/mnist.basic.ort"); // stop NNAPI partitioning at Relu so NNAPI EP only takes first Conv const auto nnapi_partitioning_stop_ops = "Relu"; - TestModelLoad(model_file_name, std::make_unique(0, nnapi_partitioning_stop_ops), - // expect one NNAPI partition - [](const Graph& graph) { ASSERT_EQ(CountAssignedNodes(graph, kNnapiExecutionProvider), 1) - << "Exactly one node should have been taken by the NNAPI EP"; }); + TestModelLoad( + model_file_name, std::make_unique(0, nnapi_partitioning_stop_ops), + // expect one NNAPI partition + [](const Graph& graph) { ASSERT_EQ(CountAssignedNodes(graph, kNnapiExecutionProvider), 1) + << "Exactly one node should have been taken by the NNAPI EP"; }); } } // namespace test diff --git a/orttraining/orttraining/python/training/ortmodule/_fallback.py b/orttraining/orttraining/python/training/ortmodule/_fallback.py index 44f96dcff7fb0..56bb45d064d8a 100644 --- a/orttraining/orttraining/python/training/ortmodule/_fallback.py +++ b/orttraining/orttraining/python/training/ortmodule/_fallback.py @@ -175,9 +175,9 @@ def fallback(self, log_level: _logger.LogLevel, *inputs, **kwargs): # This warning will not be raised again if retry is not enabled self._logger.warning( - "Fallback to PyTorch due to exception {} was triggered. " + f"Fallback to PyTorch due to exception {exception_type} was triggered. " "Report this issue with a minimal repro at https://www.github.com/microsoft/onnxruntime. " - "See details below:\n\n{}".format(exception_type, exception_string) + f"See details below:\n\n{exception_string}" ) self._raised_fallback_exception = True diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt index 42c016984bc0f..25454ce40c263 100644 --- a/requirements-lintrunner.txt +++ b/requirements-lintrunner.txt @@ -1,9 +1,9 @@ # This file is auto updated by dependabot -lintrunner-adapters>=0.8.0 +lintrunner-adapters>=0.11.0 # RUFF -ruff==0.0.292 +ruff==0.1.4 # BLACK-ISORT -black==23.7.0 +black==23.10.1 isort==5.12.0 # CLANGFORMAT -clang-format==16.0.6 +clang-format==17.0.4 diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index a38087aa0313a..e0559419ef8c7 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1637,9 +1637,7 @@ def run_adb_shell(cmd): # GCOV_PREFIX specifies the root directory # for creating the runtime code coverage files. 
if args.code_coverage: - adb_shell( - "cd {0} && GCOV_PREFIX={0} GCOV_PREFIX_STRIP={1} {2}".format(device_dir, cwd.count(os.sep) + 1, cmd) - ) + adb_shell(f"cd {device_dir} && GCOV_PREFIX={device_dir} GCOV_PREFIX_STRIP={cwd.count(os.sep) + 1} {cmd}") else: adb_shell(f"cd {device_dir} && {cmd}") @@ -1689,7 +1687,7 @@ def run_adb_shell(cmd): ) if args.use_nnapi: - run_adb_shell("{0}/onnx_test_runner -e nnapi {0}/test".format(device_dir)) + run_adb_shell(f"{device_dir}/onnx_test_runner -e nnapi {device_dir}/test") else: run_adb_shell(f"{device_dir}/onnx_test_runner {device_dir}/test") @@ -1702,9 +1700,9 @@ def run_adb_shell(cmd): adb_push("onnxruntime_customopregistration_test", device_dir, cwd=cwd) adb_shell(f"chmod +x {device_dir}/onnxruntime_shared_lib_test") adb_shell(f"chmod +x {device_dir}/onnxruntime_customopregistration_test") - run_adb_shell("LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{0} {0}/onnxruntime_shared_lib_test".format(device_dir)) + run_adb_shell(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{device_dir} {device_dir}/onnxruntime_shared_lib_test") run_adb_shell( - "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{0} {0}/onnxruntime_customopregistration_test".format(device_dir) + f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{device_dir} {device_dir}/onnxruntime_customopregistration_test" ) diff --git a/tools/python/find_optimizer_opset_version_updates_required.py b/tools/python/find_optimizer_opset_version_updates_required.py index 0076d27fe950e..8a5e57b51e38d 100644 --- a/tools/python/find_optimizer_opset_version_updates_required.py +++ b/tools/python/find_optimizer_opset_version_updates_required.py @@ -54,7 +54,7 @@ def get_call_args_from_file(filename: str, function_or_declaration: str) -> typi # TODO: handle automatically by merging lines log.error( "Call/Declaration is split over multiple lines. Please check manually." 
- "File:{} Line:{}".format(filename, line_num) + f"File:{filename} Line:{line_num}" ) continue diff --git a/winml/lib/Api.Image/inc/ImageConversionHelpers.h b/winml/lib/Api.Image/inc/ImageConversionHelpers.h index 8e3dca2ae11e8..5a9c8f21255b5 100644 --- a/winml/lib/Api.Image/inc/ImageConversionHelpers.h +++ b/winml/lib/Api.Image/inc/ImageConversionHelpers.h @@ -52,5 +52,6 @@ bool VideoFramesHaveSameDevice(const wm::IVideoFrame& video_frame_1, const wm::I wgdx::Direct3D11::IDirect3DDevice GetDeviceFromDirect3DSurface(const wgdx::Direct3D11::IDirect3DSurface& d3dSurface); constexpr std::array supportedWinMLFormats = { - DXGI_FORMAT_R8G8B8A8_UNORM, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_B8G8R8X8_UNORM}; + DXGI_FORMAT_R8G8B8A8_UNORM, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_B8G8R8X8_UNORM +}; } // namespace _winml::Imaging diff --git a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h index a3d01cdff42fb..138e755289df9 100644 --- a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h +++ b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h @@ -40,10 +40,7 @@ class TensorToVideoFrameConverter : public ImageConverter { private: GUID _d3d11TextureGUID = { - 0x14bf1054, - 0x6ce7, - 0x4c00, - {0xa1, 0x32, 0xb0, 0xf2, 0x11, 0x5D, 0xE0, 0x7f} + 0x14bf1054, 0x6ce7, 0x4c00, {0xa1, 0x32, 0xb0, 0xf2, 0x11, 0x5D, 0xE0, 0x7f} }; // {14BF1054-6CE7-4C00-A132-B0F2115DE07F} GUID _handleGUID = { 0x700148fc, 0xc0cb, 0x4a7e, {0xa7, 0xc0, 0xe7, 0x43, 0xc1, 0x9, 0x9d, 0x62} diff --git a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h index 0cd184ca70b21..ebfccabf7e814 100644 --- a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h +++ b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h @@ -50,10 +50,7 @@ class VideoFrameToTensorConverter : public ImageConverter { private: GUID d3d11_texture_GUID_ = { - 0x485e4bb3, - 0x3fe8, - 0x497b, - {0x85, 0x9e, 0xc7, 0x5, 0x18, 0xdb, 0x11, 0x2a} + 0x485e4bb3, 0x3fe8, 0x497b, {0x85, 0x9e, 0xc7, 0x5, 0x18, 0xdb, 0x11, 0x2a} }; // {485E4BB3-3FE8-497B-859E-C70518DB112A} GUID handle_GUID_ = { 0xce43264e, 0x41f7, 0x4882, {0x9e, 0x20, 0xfa, 0xa5, 0x1e, 0x37, 0x64, 0xfc} diff --git a/winml/lib/Api.Ort/OnnxruntimeModel.cpp b/winml/lib/Api.Ort/OnnxruntimeModel.cpp index fb8413a897e75..24eb44b73dd3c 100644 --- a/winml/lib/Api.Ort/OnnxruntimeModel.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeModel.cpp @@ -81,7 +81,8 @@ HRESULT ModelInfo::RuntimeClassInitialize(_In_ OnnxruntimeEngineFactory* engine_ winml_adapter_api->ModelGetInputCount, winml_adapter_api->ModelGetInputName, winml_adapter_api->ModelGetInputDescription, - winml_adapter_api->ModelGetInputTypeInfo}; + winml_adapter_api->ModelGetInputTypeInfo + }; // Create inputs std::vector inputs; @@ -93,7 +94,8 @@ HRESULT ModelInfo::RuntimeClassInitialize(_In_ OnnxruntimeEngineFactory* engine_ winml_adapter_api->ModelGetOutputCount, winml_adapter_api->ModelGetOutputName, winml_adapter_api->ModelGetOutputDescription, - winml_adapter_api->ModelGetOutputTypeInfo}; + winml_adapter_api->ModelGetOutputTypeInfo + }; std::vector outputs; RETURN_IF_FAILED(CreateFeatureDescriptors(engine_factory, &output_helpers, ort_model, outputs)); diff --git a/winml/lib/Api/impl/TensorBase.h b/winml/lib/Api/impl/TensorBase.h index c8332e11234b5..6c68abd3ce6c9 100644 --- a/winml/lib/Api/impl/TensorBase.h +++ b/winml/lib/Api/impl/TensorBase.h @@ -217,7 +217,8 @@ struct TensorBase : TBase { } D3D12_HEAP_PROPERTIES heapProperties = { - 
D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; D3D12_RESOURCE_DESC resourceDesc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, diff --git a/winml/test/adapter/AdapterDmlEpTest.cpp b/winml/test/adapter/AdapterDmlEpTest.cpp index 81437f9db2de3..b4220650abb9c 100644 --- a/winml/test/adapter/AdapterDmlEpTest.cpp +++ b/winml/test/adapter/AdapterDmlEpTest.cpp @@ -116,7 +116,8 @@ std::array tensor_values = {}; winrt::com_ptr CreateD3D12Resource(ID3D12Device& device) { constexpr uint64_t buffer_size = tensor_size * sizeof(float); constexpr D3D12_HEAP_PROPERTIES heap_properties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; constexpr D3D12_RESOURCE_DESC resource_desc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, @@ -365,6 +366,7 @@ const AdapterDmlEpTestApi& getapi() { DmlCopyTensor, CreateCustomRegistry, ValueGetDeviceId, - SessionGetInputRequiredDeviceId}; + SessionGetInputRequiredDeviceId + }; return api; } diff --git a/winml/test/adapter/AdapterSessionTest.cpp b/winml/test/adapter/AdapterSessionTest.cpp index 1b1a36004264c..8c9124b2ff4ae 100644 --- a/winml/test/adapter/AdapterSessionTest.cpp +++ b/winml/test/adapter/AdapterSessionTest.cpp @@ -368,7 +368,8 @@ const AdapterSessionTestAPI& getapi() { Profiling, CopyInputAcrossDevices, CopyInputAcrossDevices_DML, - GetNumberOfIntraOpThreads}; + GetNumberOfIntraOpThreads + }; if (SkipGpuTests()) { api.AppendExecutionProvider_DML = SkipTest; diff --git a/winml/test/api/LearningModelAPITest.cpp b/winml/test/api/LearningModelAPITest.cpp index ab45e2414854d..01ca2b8930506 100644 --- a/winml/test/api/LearningModelAPITest.cpp +++ b/winml/test/api/LearningModelAPITest.cpp @@ -247,9 +247,11 @@ static void CheckLearningModelPixelRange() { // Normalized_0_1 and image output L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx", // Normalized_1_1 and image output - L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx"}; + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx" + }; std::vector pixelRanges = { - LearningModelPixelRange::ZeroTo255, LearningModelPixelRange::ZeroToOne, LearningModelPixelRange::MinusOneToOne}; + LearningModelPixelRange::ZeroTo255, LearningModelPixelRange::ZeroToOne, LearningModelPixelRange::MinusOneToOne + }; for (uint32_t model_i = 0; model_i < modelPaths.size(); model_i++) { LearningModel learningModel = nullptr; WINML_EXPECT_NO_THROW(APITest::LoadModel(modelPaths[model_i], learningModel)); @@ -329,7 +331,8 @@ const LearningModelApiTestsApi& getapi() { CloseModelCheckEval, CloseModelNoNewSessions, CheckMetadataCaseInsensitive, - CreateCorruptModel}; + CreateCorruptModel + }; if (RuntimeParameterExists(L"noVideoFrameTests")) { api.CloseModelCheckEval = SkipTest; diff --git a/winml/test/api/LearningModelBindingAPITest.cpp b/winml/test/api/LearningModelBindingAPITest.cpp index b77421e191020..8279f4f89f0ed 100644 --- a/winml/test/api/LearningModelBindingAPITest.cpp +++ b/winml/test/api/LearningModelBindingAPITest.cpp @@ -669,7 +669,8 @@ const LearningModelBindingAPITestsApi& getapi() { VerifyOutputAfterEvaluateAsyncCalledTwice, VerifyOutputAfterImageBindCalledTwice, SequenceLengthTensorFloat, - SequenceConstructTensorString}; + SequenceConstructTensorString + }; if (SkipGpuTests()) { api.GpuSqueezeNet = SkipTest; diff --git 
a/winml/test/api/LearningModelSessionAPITest.cpp b/winml/test/api/LearningModelSessionAPITest.cpp index 21cdaa62bc898..4ec79b8a0f4c6 100644 --- a/winml/test/api/LearningModelSessionAPITest.cpp +++ b/winml/test/api/LearningModelSessionAPITest.cpp @@ -793,7 +793,8 @@ static void STFT( auto n_dfts = static_cast(1 + floor((signal_size - dft_size) / hop_size)); auto input_shape = std::vector{1, INT64(signal_size)}; auto output_shape = std::vector{ - INT64(batch_size), INT64(n_dfts), is_onesided ? ((INT64(dft_size) >> 1) + 1) : INT64(dft_size), 2}; + INT64(batch_size), INT64(n_dfts), is_onesided ? ((INT64(dft_size) >> 1) + 1) : INT64(dft_size), 2 + }; auto dft_length = TensorInt64Bit::CreateFromArray({}, {INT64(dft_size)}); auto model = @@ -1372,7 +1373,8 @@ static void ModelBuilding_GridSample_Internal(LearningModelDeviceKind kind) { 5.0000f, 5.0000f, 10.0000f, - 10.0000f}; + 10.0000f + }; input_dims = {1, 1, 3, 2}; grid_dims = {1, 2, 4, 2}; @@ -2312,7 +2314,8 @@ const LearningModelSessionAPITestsApi& getapi() { ModelBuilding_STFT, ModelBuilding_MelSpectrogramOnThreeToneSignal, ModelBuilding_MelWeightMatrix, - SetName}; + SetName + }; if (SkipGpuTests()) { api.CreateSessionDeviceDirectX = SkipTest; diff --git a/winml/test/api/RawApiHelpers.cpp b/winml/test/api/RawApiHelpers.cpp index b6f39f8e88224..e84af6d239799 100644 --- a/winml/test/api/RawApiHelpers.cpp +++ b/winml/test/api/RawApiHelpers.cpp @@ -38,7 +38,8 @@ void RunOnDevice(ml::learning_model& model, ml::learning_model_device& device, I auto channel_buffers_pointers = std::vector{ &input_data.at(0), &input_data.at(0) + channel_buffers_sizes[0], - &input_data.at(0) + channel_buffers_sizes[0] + +channel_buffers_sizes[1]}; + &input_data.at(0) + channel_buffers_sizes[0] + +channel_buffers_sizes[1] + }; WINML_EXPECT_HRESULT_SUCCEEDED(binding->bind_as_references( input_name, diff --git a/winml/test/api/RawApiTestsGpu.cpp b/winml/test/api/RawApiTestsGpu.cpp index 9c1c06a01603f..f12ba0f36cebf 100644 --- a/winml/test/api/RawApiTestsGpu.cpp +++ b/winml/test/api/RawApiTestsGpu.cpp @@ -165,7 +165,8 @@ const RawApiTestsGpuApi& getapi() { CreateDirectXMinPowerDevice, Evaluate, EvaluateNoInputCopy, - EvaluateManyBuffers}; + EvaluateManyBuffers + }; if (SkipGpuTests()) { api.CreateDirectXDevice = SkipTest; diff --git a/winml/test/concurrency/ConcurrencyTests.cpp b/winml/test/concurrency/ConcurrencyTests.cpp index 46528ef70d377..404afbf67ea1c 100644 --- a/winml/test/concurrency/ConcurrencyTests.cpp +++ b/winml/test/concurrency/ConcurrencyTests.cpp @@ -141,7 +141,8 @@ void EvalAsyncDifferentBindings() { std::vector evaluation_units(num_units, EvaluationUnit()); std::vector ifvs = { - FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png")}; + FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png") + }; // same session, different binding auto model = LearningModel::LoadFromFilePath(FileHelpers::GetModulePath() + L"model.onnx"); @@ -191,7 +192,8 @@ void MultiThreadMultiSessionOnDevice(const LearningModelDevice& device) { auto path = FileHelpers::GetModulePath() + L"model.onnx"; auto model = LearningModel::LoadFromFilePath(path); std::vector ivfs = { - FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png")}; + FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png") + }; std::vector max_indices = { 281, // tabby, tabby cat 0 // tench, Tinca tinca @@ -257,7 +259,8 @@ void 
MultiThreadSingleSessionOnDevice(const LearningModelDevice& device) { LearningModelSession model_session = nullptr; WINML_EXPECT_NO_THROW(model_session = LearningModelSession(model, device)); std::vector ivfs = { - FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png")}; + FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png") + }; std::vector max_indices = { 281, // tabby, tabby cat 0 // tench, Tinca tinca @@ -322,7 +325,8 @@ const ConcurrencyTestsApi& getapi() { MultiThreadSingleSessionGpu, EvalAsyncDifferentModels, EvalAsyncDifferentSessions, - EvalAsyncDifferentBindings}; + EvalAsyncDifferentBindings + }; if (SkipGpuTests()) { api.MultiThreadMultiSessionGpu = SkipTest; diff --git a/winml/test/image/imageTestHelper.cpp b/winml/test/image/imageTestHelper.cpp index b7c1eb42965f8..91eed2a807782 100644 --- a/winml/test/image/imageTestHelper.cpp +++ b/winml/test/image/imageTestHelper.cpp @@ -148,7 +148,8 @@ TensorFloat LoadInputImageFromGPU(SoftwareBitmap softwareBitmap, const std::wstr // 3 is number of channels we use. R G B without alpha. UINT64 bufferbytesize = 3 * sizeof(float) * softwareBitmap.PixelWidth() * softwareBitmap.PixelHeight(); D3D12_HEAP_PROPERTIES heapProperties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; D3D12_RESOURCE_DESC resourceDesc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, diff --git a/winml/test/image/imagetests.cpp b/winml/test/image/imagetests.cpp index 6157520ca96a3..2251954c59e4c 100644 --- a/winml/test/image/imagetests.cpp +++ b/winml/test/image/imagetests.cpp @@ -939,7 +939,8 @@ TEST_F(ImageTests, ImageBindingAsGPUTensor) { UINT64 buffer_byte_size = static_cast(software_bitmap.PixelWidth()) * software_bitmap.PixelHeight() * 3 * sizeof(float); D3D12_HEAP_PROPERTIES heap_properties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; D3D12_RESOURCE_DESC resource_desc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, diff --git a/winml/test/model/model_tests.cpp b/winml/test/model/model_tests.cpp index cb5cbbecb5ef0..f40f08ad2696d 100644 --- a/winml/test/model/model_tests.cpp +++ b/winml/test/model/model_tests.cpp @@ -232,7 +232,8 @@ static std::vector GetAllTestCases() { ORT_TSTR("tf_resnet_v2_152"), ORT_TSTR("vgg19"), ORT_TSTR("yolov3"), - ORT_TSTR("zfnet512")}; + ORT_TSTR("zfnet512") + }; allDisabledTests.insert(std::begin(x86DisabledTests), std::end(x86DisabledTests)); #endif // Bad onnx test output caused by previously wrong SAME_UPPER/SAME_LOWER for ConvTranspose diff --git a/winml/test/model/skip_model_tests.h b/winml/test/model/skip_model_tests.h index 174f57143ee81..cf55d8bcbae7e 100644 --- a/winml/test/model/skip_model_tests.h +++ b/winml/test/model/skip_model_tests.h @@ -161,10 +161,8 @@ std::unordered_map> disabledGpu test name -> absolute difference sampleTolerance */ std::unordered_map sampleTolerancePerTests({ - {"fp16_inception_v1_opset7_GPU",0.005 }, - {"fp16_inception_v1_opset8_GPU", 0.005}, - { "candy_opset9_GPU", - 0.00150000 }, // Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/ - { "fp16_tiny_yolov2_opset8_GPU", - 0.109000 }, // Intel(R) UHD Graphics 630 
(29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/ + {"fp16_inception_v1_opset7_GPU", 0.005}, + {"fp16_inception_v1_opset8_GPU", 0.005}, + { "candy_opset9_GPU", 0.00150000}, // Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/ + { "fp16_tiny_yolov2_opset8_GPU", 0.109000}, // Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/ }); diff --git a/winml/test/scenario/cppwinrt/CustomNullOp.h b/winml/test/scenario/cppwinrt/CustomNullOp.h index 33709c5f72d3c..b50909548a6bf 100644 --- a/winml/test/scenario/cppwinrt/CustomNullOp.h +++ b/winml/test/scenario/cppwinrt/CustomNullOp.h @@ -69,7 +69,8 @@ struct NullOperatorFactory : winrt::implements allowedEdges{ CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Double), CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float), - CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16)}; + CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16) + }; typeConstraint.allowedTypes = allowedEdges.data(); typeConstraint.allowedTypeCount = static_cast(allowedEdges.size()); diff --git a/winml/test/scenario/cppwinrt/CustomOps.cpp b/winml/test/scenario/cppwinrt/CustomOps.cpp index 075bf5ed877a3..58d0fe6e64efc 100644 --- a/winml/test/scenario/cppwinrt/CustomOps.cpp +++ b/winml/test/scenario/cppwinrt/CustomOps.cpp @@ -305,7 +305,8 @@ static void CustomKernelWithBuiltInSchema() { // Register the kernel MLOperatorEdgeDescription floatTensorType = { - MLOperatorEdgeType::Tensor, static_cast(MLOperatorTensorDataType::Float)}; + MLOperatorEdgeType::Tensor, static_cast(MLOperatorTensorDataType::Float) + }; MLOperatorEdgeTypeConstrant constraint = {"T", &floatTensorType, 1}; @@ -318,7 +319,8 @@ static void CustomKernelWithBuiltInSchema() { 1, nullptr, 0, - MLOperatorKernelOptions::AllowDynamicInputShapes}; + MLOperatorKernelOptions::AllowDynamicInputShapes + }; Microsoft::WRL::ComPtr factory = wil::MakeOrThrow(CreateABIFooKernel); @@ -614,7 +616,8 @@ static void CustomKernelWithCustomSchema() { MLOperatorEdgeTypeConstrant kernelConstraint = {"T1", &floatTensorEdgeDesc, 1}; MLOperatorKernelDescription kernelDesc = { - "", "Foo", 7, MLOperatorExecutionType::Cpu, &kernelConstraint, testCases[caseIndex].useTypeLabel ? 1u : 0u}; + "", "Foo", 7, MLOperatorExecutionType::Cpu, &kernelConstraint, testCases[caseIndex].useTypeLabel ? 
1u : 0u + }; if (!testCases[caseIndex].attributeDefaultsInSchema) { kernelDesc.defaultAttributes = defaultAttributes; @@ -693,10 +696,8 @@ static void CustomKernelWithCustomSchema() { const CustomOpsTestsApi& getapi() { static CustomOpsTestsApi api = { - CustomOpsScenarioTestsClassSetup, - CustomOperatorFusion, - CustomKernelWithBuiltInSchema, - CustomKernelWithCustomSchema}; + CustomOpsScenarioTestsClassSetup, CustomOperatorFusion, CustomKernelWithBuiltInSchema, CustomKernelWithCustomSchema + }; if (SkipGpuTests()) { api.CustomOperatorFusion = SkipTest; diff --git a/winml/test/scenario/cppwinrt/NoisyReluCpu.h b/winml/test/scenario/cppwinrt/NoisyReluCpu.h index 5f89b20beebb9..5cccbae67407c 100644 --- a/winml/test/scenario/cppwinrt/NoisyReluCpu.h +++ b/winml/test/scenario/cppwinrt/NoisyReluCpu.h @@ -157,7 +157,8 @@ struct NoisyReluOperatorFactory : winrt::implements allowedEdges{ CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Double), CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float), - CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16)}; + CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16) + }; typeConstraint.allowedTypes = allowedEdges.data(); typeConstraint.allowedTypeCount = static_cast(allowedEdges.size()); @@ -194,7 +195,8 @@ struct NoisyReluOperatorFactory : winrt::implements attributeDefaultValues{ - noisyReluMeanAttributeValue, noisyReluVarianceAttributeValue}; + noisyReluMeanAttributeValue, noisyReluVarianceAttributeValue + }; noisyReluSchema.defaultAttributes = attributeDefaultValues.data(); noisyReluSchema.defaultAttributeCount = static_cast(attributeDefaultValues.size()); @@ -216,7 +218,8 @@ struct NoisyReluOperatorFactory : winrt::implements allowedEdges{ CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Double), CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float), - CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16)}; + CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16) + }; typeConstraint.allowedTypes = allowedEdges.data(); typeConstraint.allowedTypeCount = static_cast(allowedEdges.size()); @@ -239,7 +242,8 @@ struct NoisyReluOperatorFactory : winrt::implements attributeDefaultValues{ - noisyReluMeanAttributeValue, noisyReluVarianceAttributeValue}; + noisyReluMeanAttributeValue, noisyReluVarianceAttributeValue + }; kernelDescription.defaultAttributes = attributeDefaultValues.data(); kernelDescription.defaultAttributeCount = static_cast(attributeDefaultValues.size()); kernelDescription.options = MLOperatorKernelOptions::None; diff --git a/winml/test/scenario/cppwinrt/ReluCpu.h b/winml/test/scenario/cppwinrt/ReluCpu.h index c72285a4de7fb..7bb275f7b399b 100644 --- a/winml/test/scenario/cppwinrt/ReluCpu.h +++ b/winml/test/scenario/cppwinrt/ReluCpu.h @@ -114,7 +114,8 @@ struct ReluOperatorFactory : winrt::implements allowedEdges{ CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Double), CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float), - CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16)}; + CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16) + }; typeConstraint.allowedTypes = allowedEdges.data(); typeConstraint.allowedTypeCount = static_cast(allowedEdges.size()); diff --git 
a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp index 9b389d014c953..9a03172340bf7 100644 --- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp +++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp @@ -510,7 +510,8 @@ static void Scenario9LoadBindEvalInputTensorGPU() { UINT64 bufferbytesize = 720 * 720 * 3 * sizeof(float); D3D12_HEAP_PROPERTIES heapProperties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; D3D12_RESOURCE_DESC resourceDesc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, @@ -983,7 +984,8 @@ static void Scenario22ImageBindingAsGPUTensor() { // 3 is number of channels we use. R G B without alpha. UINT64 bufferbytesize = 3 * sizeof(float) * softwareBitmap.PixelWidth() * softwareBitmap.PixelHeight(); D3D12_HEAP_PROPERTIES heapProperties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; D3D12_RESOURCE_DESC resourceDesc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, @@ -1085,7 +1087,8 @@ static void Scenario23NominalPixelRange() { std::vector modelPaths = {// Normalized_0_1 and image output modulePath + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx", // Normalized_1_1 and image output - modulePath + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx"}; + modulePath + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx" + }; for (uint32_t model_i = 0; model_i < modelPaths.size(); model_i++) { // load model and create session From 885bf3561da143ad90d30b1d5959a9b065f5a6c9 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Thu, 9 Nov 2023 10:12:57 +1000 Subject: [PATCH 10/13] Add tool to fix lines > 120 chars. (#18293) ### Description Helper to run clang-format on lines that are > 120 chars. We disable clang-format enforcing 120 chars by default because its formatting can negatively impact readability. If a developer has not manually kept a line within the 120 char limit, this tool will fix it. It will leave all other lines alone to honor the formatting the developer chose. ### Motivation and Context Help developers fix lint errors. The preferred approach is to use a vertical ruler/guideline in your editor when actually writing the code. --- tools/python/fix_long_lines.py | 134 +++++++++++++++++++++++++++++++++ tools/python/util/logger.py | 4 +- 2 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 tools/python/fix_long_lines.py diff --git a/tools/python/fix_long_lines.py b/tools/python/fix_long_lines.py new file mode 100644 index 0000000000000..383fdc9623551 --- /dev/null +++ b/tools/python/fix_long_lines.py @@ -0,0 +1,134 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License.
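For illustration, a hypothetical way to drive the helper added below (assuming the repository root as the working directory and a `clang-format` binary on the PATH; the flags mirror the argparse options defined later in this file):

```python
# Hypothetical invocation sketch for tools/python/fix_long_lines.py (illustration only).
import subprocess
import sys

# Reformat only the over-long lines in .cc/.h files that differ from origin/main,
# which is the intended pre-PR usage described above.
subprocess.run(
    [sys.executable, "tools/python/fix_long_lines.py", "--branch", "origin/main"],
    check=True,
)

# Or process everything under include/onnxruntime and onnxruntime/core instead.
subprocess.run(
    [sys.executable, "tools/python/fix_long_lines.py", "--all_files"],
    check=True,
)
```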
+ +import argparse +import logging +import os +import pathlib +import shutil +import tempfile + +from util import logger, run + +_log = logger.get_logger("fix_long_lines", logging.INFO) + + +# look for long lines in the file, and if found run clang-format on those lines +def _process_files(filenames, clang_exe, tmpdir): + for path in filenames: + _log.debug(f"Checking {path}") + bad_lines = [] + + with open(path, encoding="UTF8") as f: + line_num = 0 + for line in f: + line_num += 1 # clang-format line numbers start at 1 + if len(line) > 120: + bad_lines.append(line_num) + + if bad_lines: + _log.info(f"Updating {path}") + filename = os.path.basename(path) + target = os.path.join(tmpdir, filename) + shutil.copy(path, target) + + # run clang-format to update just the long lines in the file + cmd = [ + clang_exe, + "-i", + ] + for line in bad_lines: + cmd.append(f"--lines={line}:{line}") + + cmd.append(target) + + run(*cmd, cwd=tmpdir, check=True, shell=True) + + # copy updated file back to original location + shutil.copy(target, path) + + +# file extensions we process +_EXTENSIONS = [".cc", ".h"] + + +def _get_branch_diffs(ort_root, branch): + command = ["git", "diff", branch, "--name-only"] + result = run(*command, capture_stdout=True, check=True) + + # stdout is bytes. one filename per line. decode, split, and filter to the extensions we are looking at + for f in result.stdout.decode("utf-8").splitlines(): + if os.path.splitext(f.lower())[-1] in _EXTENSIONS: + yield os.path.join(ort_root, f) + + +def _get_file_list(path): + for root, _, files in os.walk(path): + for file in files: + if os.path.splitext(file.lower())[-1] in _EXTENSIONS: + yield os.path.join(root, file) + + +def main(): + argparser = argparse.ArgumentParser( + "Script to fix long lines in the source using clang-format. " + "Only lines that exceed the 120 character maximum are altered in order to minimize the impact. " + "Checks .cc and .h files", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + argparser.add_argument( + "--branch", + type=str, + default="origin/main", + help="Limit changes to files that differ from this branch. Use origin/main when preparing a PR.", + ) + + argparser.add_argument( + "--all_files", + action="store_true", + help="Process all files under /include/onnxruntime and /onnxruntime/core. 
Ignores --branch value.", + ) + + argparser.add_argument( + "--clang-format", + type=pathlib.Path, + required=False, + default="clang-format", + help="Path to clang-format executable", + ) + + argparser.add_argument("--debug", action="store_true", help="Set log level to DEBUG.") + + args = argparser.parse_args() + + if args.debug: + _log.setLevel(logging.DEBUG) + + script_dir = os.path.dirname(os.path.realpath(__file__)) + ort_root = os.path.abspath(os.path.join(script_dir, "..", "..")) + + with tempfile.TemporaryDirectory() as tmpdir: + # create config in tmpdir + with open(os.path.join(tmpdir, ".clang-format"), "w") as f: + f.write( + """ + BasedOnStyle: Google + ColumnLimit: 120 + DerivePointerAlignment: false + """ + ) + + clang_format = str(args.clang_format) + + if args.all_files: + include_path = os.path.join(ort_root, "include", "onnxruntime") + src_path = os.path.join(ort_root, "onnxruntime", "core") + _process_files(_get_file_list(include_path), clang_format, tmpdir) + _process_files(_get_file_list(src_path), clang_format, tmpdir) + else: + _process_files(_get_branch_diffs(ort_root, args.branch), clang_format, tmpdir) + + +if __name__ == "__main__": + main() diff --git a/tools/python/util/logger.py b/tools/python/util/logger.py index 15e04528ac7ac..d6f3026959daa 100644 --- a/tools/python/util/logger.py +++ b/tools/python/util/logger.py @@ -4,8 +4,8 @@ import logging -def get_logger(name): +def get_logger(name, level=logging.DEBUG): logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s") logger = logging.getLogger(name) - logger.setLevel(logging.DEBUG) + logger.setLevel(level) return logger From 4dc63692f83ad1217e0964f6efbf78f4c83dfb60 Mon Sep 17 00:00:00 2001 From: guyang3532 <62738430+guyang3532@users.noreply.github.com> Date: Thu, 9 Nov 2023 09:52:48 +0800 Subject: [PATCH 11/13] Add FlattenAndUnpad Op (#17845) ### Description Add an op named `FlattenAndUnpad`. This op implements two functions: 1. Flatten the first two dims of the input tensor. 2. Gather the valid values from the input tensor with the index tensor. ### Motivation and Context The gradient op of `PadAndUnflatten` was `GatherGrad`, which performs poorly. I implemented `FlattenAndUnpad` to replace `GatherGrad` as the gradient of `PadAndUnflatten`. With this op, we can also simplify the "Reshape + ShrunkenGather" pattern to `FlattenAndUnpad` in the padding elimination optimizer, which will also improve performance.
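For reference, a minimal NumPy sketch of the FlattenAndUnpad / PadAndUnflatten contract described above (illustration only, not the CUDA kernels added in this patch; the sample values are taken from the new unit tests):

```python
# Minimal NumPy sketch of the FlattenAndUnpad / PadAndUnflatten semantics (illustration only).
import numpy as np


def flatten_and_unpad(x, indices):
    # Flatten the first two dims [M1, M2, d2, ..., dN] -> [M1*M2, d2, ..., dN],
    # then keep only the rows selected by `indices`.
    m1, m2 = x.shape[0], x.shape[1]
    flat = x.reshape((m1 * m2,) + x.shape[2:])
    return flat[indices], np.array([m1, m2], dtype=np.int64)  # output, unflatten_dims


def pad_and_unflatten(x, indices, unflatten_dims):
    # Inverse direction: scatter the rows of x back into a zero-padded
    # [M1, M2, d2, ..., dN] tensor at the flattened positions given by `indices`.
    m1, m2 = int(unflatten_dims[0]), int(unflatten_dims[1])
    out = np.zeros((m1 * m2,) + x.shape[1:], dtype=x.dtype)
    out[indices] = x
    return out.reshape((m1, m2) + x.shape[1:])


# Values taken from FlattenAndUnpadTest.Int32Type3D in this patch.
x = np.array([[[0, 0, 0], [1, 2, 3], [0, 0, 0]],
              [[4, 5, 6], [7, 8, 9], [0, 0, 0]]], dtype=np.int32)
idx = np.array([1, 3, 4], dtype=np.int64)
unpadded, dims = flatten_and_unpad(x, idx)
print(unpadded)  # [[1 2 3] [4 5 6] [7 8 9]]
print(dims)      # [2 3]
print(pad_and_unflatten(unpadded, idx, dims))  # recovers x, with zeros in the padded rows
```

In the training graph the two ops are registered as each other's gradients (see the gradient_builder.cc change below), which is what allows FlattenAndUnpad to replace the GatherGrad-based path.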
--- .../core/graph/gradient_builder.cc | 17 +- .../orttraining/core/graph/gradient_builder.h | 1 + .../core/graph/gradient_builder_registry.cc | 1 + .../core/graph/training_op_defs.cc | 26 ++- .../compute_optimizer/padding_elimination.cc | 90 +++------- .../test/gradient/gradient_ops_test.cc | 3 +- .../python/orttraining_test_ortmodule_api.py | 12 +- .../cuda/flatten_and_unpad_test.cc | 157 ++++++++++++++++++ .../cuda/pad_and_unflatten_test.cc | 12 -- .../cuda/cuda_training_kernels.cc | 2 + .../cuda/tensor/flatten_and_unpad.cc | 91 ++++++++++ .../cuda/tensor/flatten_and_unpad.h | 21 +++ .../cuda/tensor/flatten_and_unpad_impl.cu | 83 +++++++++ .../cuda/tensor/flatten_and_unpad_impl.h | 25 +++ .../cuda/tensor/pad_and_unflatten.cc | 11 +- .../cuda/tensor/pad_and_unflatten_impl.cu | 12 +- .../rocm/rocm_training_kernels.cc | 2 + 17 files changed, 448 insertions(+), 118 deletions(-) create mode 100644 orttraining/orttraining/test/training_ops/cuda/flatten_and_unpad_test.cc create mode 100644 orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.cc create mode 100644 orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.h create mode 100644 orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.cu create mode 100644 orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index 7100cedaf78a0..755a8e49d9d12 100755 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -791,13 +791,16 @@ IMPLEMENT_GRADIENT_BUILDER(GetGatherGradient) { IMPLEMENT_GRADIENT_BUILDER(GetPadAndUnflattenGradient) { return std::vector{ - NodeDef(OpDef("Reshape"), - {GO(0), O(1)}, - {IA("GO_reshaped")}), - NodeDef(OpDef{"Gather", kOnnxDomain, 1}, - {IA("GO_reshaped"), I(1)}, - {GI(0)}, - SrcNodeAttributes())}; + NodeDef(OpDef{"FlattenAndUnpad", kMSDomain, 1}, + {GO(0), I(1)}, + {GI(0), IA("Unflatten_dims")})}; +} + +IMPLEMENT_GRADIENT_BUILDER(GetFlattenAndUnpadGradient) { + return std::vector{ + NodeDef(OpDef{"PadAndUnflatten", kMSDomain, 1}, + {GO(0), I(1), O(1)}, + {GI(0)})}; } IMPLEMENT_GRADIENT_BUILDER(GetShrunkenGatherGradient) { diff --git a/orttraining/orttraining/core/graph/gradient_builder.h b/orttraining/orttraining/core/graph/gradient_builder.h index 08987a86ebda9..92bfae9cd83a4 100755 --- a/orttraining/orttraining/core/graph/gradient_builder.h +++ b/orttraining/orttraining/core/graph/gradient_builder.h @@ -40,6 +40,7 @@ DECLARE_GRADIENT_BUILDER(GetAveragePoolGradient) DECLARE_GRADIENT_BUILDER(GetMaxPoolGradient) DECLARE_GRADIENT_BUILDER(GetGatherGradient) DECLARE_GRADIENT_BUILDER(GetPadAndUnflattenGradient) +DECLARE_GRADIENT_BUILDER(GetFlattenAndUnpadGradient) DECLARE_GRADIENT_BUILDER(GetShrunkenGatherGradient) DECLARE_GRADIENT_BUILDER(GetConvGradient) DECLARE_GRADIENT_BUILDER(GetUnsqueezeGradient) diff --git a/orttraining/orttraining/core/graph/gradient_builder_registry.cc b/orttraining/orttraining/core/graph/gradient_builder_registry.cc index f280a02cb490f..ea56be9e6dfa3 100755 --- a/orttraining/orttraining/core/graph/gradient_builder_registry.cc +++ b/orttraining/orttraining/core/graph/gradient_builder_registry.cc @@ -72,6 +72,7 @@ void GradientBuilderRegistry::RegisterGradientBuilders() { REGISTER_GRADIENT_BUILDER("MaxPool", GetMaxPoolGradient); REGISTER_GRADIENT_BUILDER("Gather", GetGatherGradient); REGISTER_GRADIENT_BUILDER("PadAndUnflatten", 
GetPadAndUnflattenGradient); + REGISTER_GRADIENT_BUILDER("FlattenAndUnpad", GetFlattenAndUnpadGradient); REGISTER_GRADIENT_BUILDER("ShrunkenGather", GetShrunkenGatherGradient); REGISTER_GRADIENT_BUILDER("Conv", GetConvGradient); REGISTER_GRADIENT_BUILDER("Squeeze", GetSqueezeGradient); diff --git a/orttraining/orttraining/core/graph/training_op_defs.cc b/orttraining/orttraining/core/graph/training_op_defs.cc index 283883c2e33c6..8d3f76be20c65 100644 --- a/orttraining/orttraining/core/graph/training_op_defs.cc +++ b/orttraining/orttraining/core/graph/training_op_defs.cc @@ -4741,7 +4741,7 @@ Return true if all elements are true and false otherwise. "For other indices, the corresponding value in output will be padded to zero." "The indices don't allow duplicated index values, otherwise, though there is no runtime check" - "(in case of performance concern), the behaviour of output is undefined." + "(in case of performance concern), the behavior of output is undefined." "An example:" " input: [[1, 2, 3, 4], [5, 6, 7, 8]], shape is [2, 4]" @@ -4749,14 +4749,12 @@ Return true if all elements are true and false otherwise. " unflatten_dims: [2, 3], shape is [2]" " output: [[[1, 2, 3, 4], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [5, 6, 7, 8]]]," - " shape is [2, 3, 4]" - " flatten_output_shape: [6, 4], shape is [2]") + " shape is [2, 3, 4]") .Input(0, "input", "input data of rank N, shape is [d1, d2, ..., dN]", "T") .Input(1, "indices", "1D Tensor of int32/int64 indices, shape is [d1], each element's value ranges in [0, M1*M2).", "T_INDEX") .Input(2, "unflatten_dims", "1D tensor with two values, [M1, M2].", "T_INT") .Output(0, "output", "output data of rank N+1, [M1, M2, d2, ..., dN]", "T") - .Output(1, "flatten_output_shape", "1D tensor with output shape, [M1*M2, d2, ..., dN]", "T_INT") .TypeConstraint( "T_INT", {"tensor(int32)", "tensor(int64)"}, @@ -4770,6 +4768,26 @@ Return true if all elements are true and false otherwise. {"tensor(int32)", "tensor(int64)"}, "Constrain indices to integer types"); + ONNX_CONTRIB_OPERATOR_SCHEMA(FlattenAndUnpad) + .SetDomain(kMSDomain) + .SinceVersion(1) + .SetDoc( + "FlattenAndUnpad operator flattens the first two dims of input tensor, and unpad according to given indices." 
+ "This is used by padding elimination graph transformer.") + .Input(0, "input", "input data of rank N + 1, shape is [M1, M2, d2, ..., dN]", "T") + .Input(1, "indices", "1D Tensor of int32/int64 indices, shape is [d1], each element's value ranges in [0, M1*M2).", + "T_INT") + .Output(0, "output", "output data of rank N, [d1, d2, ..., dN]", "T") + .Output(1, "unflatten_dims", "1D tensor with two values, [M1, M2].", "T_INT") + .TypeConstraint( + "T_INT", + {"tensor(int32)", "tensor(int64)"}, + "Constrain indices and shape to integer tensors.") + .TypeConstraint( + "T", + {"tensor(int32)", "tensor(int64)", "tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, + "Constrain input and output types to float tensors."); + ONNX_CONTRIB_OPERATOR_SCHEMA(GRUTraining) .SetDomain(kMSDomain) .SinceVersion(1) diff --git a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc index 74247c059cf84..73638e8ba62a0 100644 --- a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc +++ b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc @@ -129,91 +129,43 @@ NodeArg* InsertExpandForNodeInput(Graph& graph, return new_expand_node->MutableOutputDefs()[0]; } -// Insert Reshape + ShrunkenGather to flatten the in_index-th input of node. +// Insert FlattenAndUnpad to flatten and unpad the in_index-th input of node. // The gather_index_arg is the indices of the elements that are not padding. NodeArg* InsertFlattenPatternForInput(Graph& graph, Node& node, uint32_t in_index, NodeArg* gather_index_arg, const logging::Logger& logger) { - InlinedVector reshape_input_args; - reshape_input_args.reserve(2); - reshape_input_args.push_back(node.MutableInputDefs()[in_index]); - std::vector new_shape; - new_shape.push_back(-1); // only support flatten 0 and 1 dims - auto input_shape = node.InputDefs()[in_index]->Shape(); - ORT_ENFORCE(input_shape->dim_size() >= 2); - ONNX_NAMESPACE::TensorShapeProto flattened_shape; - if (input_shape->dim(0).has_dim_value() && input_shape->dim(1).has_dim_value()) { - flattened_shape.add_dim()->set_dim_value(input_shape->dim(0).dim_value() * input_shape->dim(1).dim_value()); - } else { - std::string token_dim_name = MakeString("total_token_count_", utils::GetRandomSeed()); - flattened_shape.add_dim()->set_dim_param(token_dim_name); - } - for (int k = 2; k < input_shape->dim_size(); k++) { - ORT_ENFORCE(input_shape->dim(k).has_dim_value()); - new_shape.push_back(input_shape->dim(k).dim_value()); - flattened_shape.add_dim()->set_dim_value(input_shape->dim(k).dim_value()); - } - ONNX_NAMESPACE::TensorProto new_shape_const_tensor; - new_shape_const_tensor.set_name(graph.GenerateNodeArgName("new_shape")); - new_shape_const_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); - new_shape_const_tensor.add_dims(new_shape.size()); - new_shape_const_tensor.set_raw_data(new_shape.data(), new_shape.size() * sizeof(int64_t)); - NodeArg* new_shape_arg = &graph_utils::AddInitializer(graph, new_shape_const_tensor); - reshape_input_args.push_back(new_shape_arg); - - InlinedVector reshape_output_args; - reshape_output_args.push_back( - &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("inputs_reshape_result"), - node.MutableInputDefs()[in_index]->TypeAsProto())); - - Node* new_reshape_node = InsertIntermediateNodeOnDestInput( - graph, node, - in_index, - 0, - 0, - graph.GenerateNodeName("Reshape"), - "Reshape", - 
"Reshape node to filter invalid tokens.", - reshape_input_args, - reshape_output_args, - {}, - "", - logger); + InlinedVector unpad_input_args; + unpad_input_args.reserve(2); + unpad_input_args.push_back(node.MutableInputDefs()[in_index]); + unpad_input_args.push_back(gather_index_arg); - new_reshape_node->SetExecutionProviderType(node.GetExecutionProviderType()); - auto reshape_out_arg = new_reshape_node->MutableOutputDefs()[0]; - - reshape_out_arg->SetShape(flattened_shape); - - InlinedVector gather_input_args; - gather_input_args.reserve(2); - gather_input_args.push_back(reshape_output_args[0]); - gather_input_args.push_back(gather_index_arg); - - InlinedVector gather_output_args; - gather_output_args.push_back( + InlinedVector unpad_output_args; + unpad_output_args.push_back( &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("padding_filter_result"), - reshape_out_arg->TypeAsProto())); + nullptr)); + unpad_output_args.push_back( + &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("d1_d2_shape"), + nullptr)); - Node* new_gather_node = InsertIntermediateNodeOnDestInput( + Node* unpad_node = InsertIntermediateNodeOnDestInput( graph, node, in_index, 0, 0, graph.GenerateNodeName("PaddingFilter"), - "ShrunkenGather", - "ShrunkenGather node to filter invalid tokens.", - gather_input_args, - gather_output_args, + "FlattenAndUnpad", + "FlattenAndUnpad node to filter invalid tokens.", + unpad_input_args, + unpad_output_args, {}, kMSDomain, logger); - new_gather_node->SetExecutionProviderType(node.GetExecutionProviderType()); - auto gather_out_arg = new_gather_node->MutableOutputDefs()[0]; - return gather_out_arg; + unpad_node->SetExecutionProviderType(node.GetExecutionProviderType()); + auto unpad_out_arg = unpad_node->MutableOutputDefs()[0]; + return unpad_out_arg; } // Insert PadAndUnflatten to unflatten the shape of the in_index-th input of node. 
@@ -236,10 +188,6 @@ NodeArg* InsertNodesForOutput(Graph& graph, pad_node_output_args.push_back( &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("padded_result"), nullptr)); - pad_node_output_args.push_back( - &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("padded_d1xd2_shape"), - nullptr)); - Node* new_gathergrad_node = InsertIntermediateNodeOnDestInput( graph, node, in_index, diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index 890a1bbccbc92..6fb42dd59b6a0 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -3011,7 +3011,6 @@ TEST(GradientCheckerTest, PadAndUnflattenGrad) { std::vector> x_datas = {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, {3, 5, 0, 1}, {5, 2}}; TensorInfo padded_out_info({5, 2, 3}, true); - TensorInfo out_shape_info({2}, false, nullptr, DataTypeImpl::GetTensorType()); std::vector> execution_providers; #ifdef USE_CUDA @@ -3021,7 +3020,7 @@ TEST(GradientCheckerTest, PadAndUnflattenGrad) { #endif ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indices_info, shape_info}, - {padded_out_info, out_shape_info}, &max_error, + {padded_out_info}, &max_error, x_datas, {}, true, false, &execution_providers)); EXPECT_IS_TINY(max_error); } diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index c8ec2e52f3078..13024b81f4b3c 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -5786,14 +5786,14 @@ def __init__(self, vocab_size, hidden_size, pad_token_id): # the test_op should be included in padding elimination subgraph and the PadAndUnflatten should be # added to output of test_op. # in case 2, the shapes of inputs of test_op are [batch_size, seqlen, hidden_size] and [batch_size, 1, hidden_size], - # the test_op should be included in padding elimination subgraph and a 'Expand + Reshape + ShrunkenGather' + # the test_op should be included in padding elimination subgraph and a 'Expand + FlattenAndUnpad' # pattern should be insert to the arg of [batch_size, 1, hidden_size]. # in case 3, the shapes of inputs of test_op are [batch_size, seqlen, hidden_size] and [1, hidden_size], - # the test_op should be included in padding elimination subgraph and a 'Expand + Reshape + ShrunkenGather' + # the test_op should be included in padding elimination subgraph and a 'Expand + FlattenAndUnpad' # pattern should be insert to the arg of [batch_size, 1, hidden_size]. # in case 4, the shapes of inputs of test_op are [batch_size, seqlen, hidden_size] and [batch_size, seqlen, hidden_size], # the test_op should be included in padding elimination subgraph and the PadAndUnflatten should be added to - # output of test_op. Besides, the other input of Add should be added 'Reshape + ShrunkenGather' to + # output of test_op. Besides, the other input of Add should be added 'FlattenAndUnpad' to # flatten and elimination padding. 
def test_elementwise(self, input_ids): input_shape = input_ids.size() @@ -5905,9 +5905,9 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): assert len([node.op_type for node in training_model.graph.node if node.op_type == "Squeeze"]) == 1 assert len([node.op_type for node in training_model.graph.node if node.op_type == "PadAndUnflatten"]) == 1 if case >= 2: - assert len([node.op_type for node in training_model.graph.node if node.op_type == "ShrunkenGather"]) == 2 + assert len([node.op_type for node in training_model.graph.node if node.op_type == "FlattenAndUnpad"]) == 3 else: - assert len([node.op_type for node in training_model.graph.node if node.op_type == "ShrunkenGather"]) == 1 + assert len([node.op_type for node in training_model.graph.node if node.op_type == "FlattenAndUnpad"]) == 2 gathergrad_node = next(node for node in training_model.graph.node if node.op_type == "PadAndUnflatten") def find_input_node_type(model, arg): @@ -6071,7 +6071,7 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): _test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-3, rtol=1e-4) training_model = ort_model._torch_module._execution_manager(True)._onnx_models.optimized_model - assert "ShrunkenGather" in [node.op_type for node in training_model.graph.node] + assert "FlattenAndUnpad" in [node.op_type for node in training_model.graph.node] assert "PadAndUnflatten" in [node.op_type for node in training_model.graph.node] del os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] diff --git a/orttraining/orttraining/test/training_ops/cuda/flatten_and_unpad_test.cc b/orttraining/orttraining/test/training_ops/cuda/flatten_and_unpad_test.cc new file mode 100644 index 0000000000000..dd5fa18ab3edd --- /dev/null +++ b/orttraining/orttraining/test/training_ops/cuda/flatten_and_unpad_test.cc @@ -0,0 +1,157 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "test/common/tensor_op_test_utils.h" +#include "test/providers/provider_test_utils.h" + +namespace onnxruntime { +namespace test { + +#if defined(USE_CUDA) || defined(USE_ROCM) + +TEST(FlattenAndUnpadTest, Int32Type2D) { + std::vector input = {1, 1, 3, 2, 0, 3, 0, 4, + 0, 5, 0, 6, 0, 0, 0}; + std::vector indices = {1, 3, 5, 7, 9, 11}; + + std::vector output = {1, 2, 3, 4, 5, 6}; + std::vector unflatten_dims = {5, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {5, 3}, input); + test.AddInput("indices", {6}, indices); + test.AddOutput("output", {6}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, Int32Type3D) { + std::vector input = {0, 0, 0, 1, 2, 3, 0, 0, 0, + 4, 5, 6, 7, 8, 9, 0, 0, 0}; + std::vector indices = {1, 3, 4}; + + std::vector output = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector unflatten_dims = {2, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {2, 3, 3}, input); + test.AddInput("indices", {3}, indices); + test.AddOutput("output", {3, 3}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, Int64Type2D) { + std::vector input = {1, 1, 3, 2, 0, 3, 0, 4, + 0, 5, 0, 6, 0, 0, 0}; + std::vector indices = {1, 3, 5, 7, 9, 11}; + + std::vector output = {1, 2, 3, 4, 5, 6}; + std::vector unflatten_dims = {5, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {5, 3}, input); + test.AddInput("indices", {6}, indices); + test.AddOutput("output", {6}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, Int64Type3D) { + std::vector input = {0, 0, 0, 1, 2, 3, 0, 0, 0, + 4, 5, 6, 7, 8, 9, 0, 0, 0}; + std::vector indices = {1, 3, 4}; + + std::vector output = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector unflatten_dims = {2, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {2, 3, 3}, input); + test.AddInput("indices", {3}, indices); + test.AddOutput("output", {3, 3}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, FloatType2D) { + std::vector input = {1.0f, 1.0f, 3.0f, 2.0f, 0.0f, 3.0f, 0.0f, 4.0f, + 0.0f, 5.0f, 0.0f, 6.0f, 0.0f, 0.0f, 0.0f}; + std::vector indices = {1, 3, 5, 7, 9, 11}; + + std::vector output = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.f}; + std::vector unflatten_dims = {5, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {5, 3}, input); + test.AddInput("indices", {6}, indices); + test.AddOutput("output", {6}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, FloatType3D) { + std::vector input = {0.0f, 0.0f, 0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f, + 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 0.0f, 0.0f}; + std::vector indices = {1, 3, 4}; + + std::vector output = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.f, 7.f, 8.f, 9.f}; + std::vector unflatten_dims = {2, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {2, 3, 3}, input); + test.AddInput("indices", {3}, indices); + test.AddOutput("output", {3, 3}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, MLFloat16Type2D) { + std::vector input = {0.0f, 1.0f, 0.0f, 2.0f, 0.0f, 3.0f, 0.0f, 4.0f, + 0.0f, 5.0f, 0.0f, 
6.0f, 0.0f, 0.0f, 0.0f}; + std::vector indices = {1, 3, 5, 7, 9, 11}; + + std::vector output = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.f}; + std::vector unflatten_dims = {5, 3}; + + std::vector input_half; + input_half.resize(input.size()); + ConvertFloatToMLFloat16(input.data(), input_half.data(), static_cast(input.size())); + std::vector output_half; + output_half.resize(output.size()); + ConvertFloatToMLFloat16(output.data(), output_half.data(), static_cast(output.size())); + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {5, 3}, input_half); + test.AddInput("indices", {6}, indices); + test.AddOutput("output", {6}, output_half); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, MLFloat16Type3D) { + std::vector input = {0.0f, 0.0f, 0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f, + 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 0.0f, 0.0f}; + std::vector indices = {1, 3, 4}; + + std::vector output = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.f, 7.f, 8.f, 9.f}; + std::vector unflatten_dims = {2, 3}; + + std::vector input_half; + input_half.resize(input.size()); + ConvertFloatToMLFloat16(input.data(), input_half.data(), static_cast(input.size())); + std::vector output_half; + output_half.resize(output.size()); + ConvertFloatToMLFloat16(output.data(), output_half.data(), static_cast(output.size())); + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {2, 3, 3}, input_half); + test.AddInput("indices", {3}, indices); + test.AddOutput("output", {3, 3}, output_half); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +#endif + +} // namespace test +} // namespace onnxruntime diff --git a/orttraining/orttraining/test/training_ops/cuda/pad_and_unflatten_test.cc b/orttraining/orttraining/test/training_ops/cuda/pad_and_unflatten_test.cc index a800f17e59ae0..9a86955e09379 100644 --- a/orttraining/orttraining/test/training_ops/cuda/pad_and_unflatten_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/pad_and_unflatten_test.cc @@ -17,14 +17,11 @@ TEST(PadAndUnflattenTest, FloatType1D) { std::vector output = {0.0f, 1.0f, 0.0f, 2.0f, 0.0f, 3.0f, 0.0f, 4.0f, 0.0f, 5.0f, 0.0f, 6.0f, 0.0f, 0.0f, 0.0f}; - std::vector full_flatten_dims = {15}; - OpTester test("PadAndUnflatten", 1, onnxruntime::kMSDomain); test.AddInput("input", {6}, input); test.AddInput("indices", {6}, indices); test.AddInput("unflatten_dims", {2}, unflatten_dims); test.AddOutput("output", {5, 3}, output); - test.AddOutput("full_flatten_dims", {1}, full_flatten_dims); test.Run(); } @@ -36,14 +33,11 @@ TEST(PadAndUnflattenTest, FloatType2D) { std::vector output = {0.0f, 0.0f, 0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 0.0f, 0.0f}; - std::vector full_flatten_dims = {6, 3}; - OpTester test("PadAndUnflatten", 1, onnxruntime::kMSDomain); test.AddInput("input", {3, 3}, input); test.AddInput("indices", {3}, indices); test.AddInput("unflatten_dims", {2}, unflatten_dims); test.AddOutput("output", {2, 3, 3}, output); - test.AddOutput("full_flatten_dims", {2}, full_flatten_dims); test.Run(); } @@ -55,8 +49,6 @@ TEST(PadAndUnflattenTest, MLFloat16Type1D) { std::vector output = {0.0f, 1.0f, 0.0f, 2.0f, 0.0f, 3.0f, 0.0f, 4.0f, 0.0f, 5.0f, 0.0f, 6.0f, 0.0f, 0.0f, 0.0f}; - std::vector full_flatten_dims = {15}; - std::vector input_half; input_half.resize(input.size()); ConvertFloatToMLFloat16(input.data(), input_half.data(), int(input.size())); @@ -69,7 +61,6 @@ 
TEST(PadAndUnflattenTest, MLFloat16Type1D) { test.AddInput("indices", {6}, indices); test.AddInput("unflatten_dims", {2}, unflatten_dims); test.AddOutput("output", {5, 3}, output_half); - test.AddOutput("full_flatten_dims", {1}, full_flatten_dims); test.Run(); } @@ -81,8 +72,6 @@ TEST(PadAndUnflattenTest, MLFloat16Type2D) { std::vector output = {0.0f, 0.0f, 0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 0.0f, 0.0f}; - std::vector full_flatten_dims = {6, 3}; - std::vector input_half; input_half.resize(input.size()); ConvertFloatToMLFloat16(input.data(), input_half.data(), int(input.size())); @@ -95,7 +84,6 @@ TEST(PadAndUnflattenTest, MLFloat16Type2D) { test.AddInput("indices", {3}, indices); test.AddInput("unflatten_dims", {2}, unflatten_dims); test.AddOutput("output", {2, 3, 3}, output_half); - test.AddOutput("full_flatten_dims", {2}, full_flatten_dims); test.Run(); } diff --git a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc index eeaa51c4dc1d8..dcf733153bdad 100644 --- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc @@ -207,6 +207,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, FakeQuantGrad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BatchScale); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, PadAndUnflatten); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, FlattenAndUnpad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, ScaledSum); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ResizeGrad); @@ -462,6 +463,7 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.cc b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.cc new file mode 100644 index 0000000000000..8bfb10f686771 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.cc @@ -0,0 +1,91 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "orttraining/training_ops/cuda/tensor/flatten_and_unpad.h" +#include "orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h" +#include "core/providers/cuda/shared_inc/cuda_utils.h" + +namespace onnxruntime { +namespace cuda { + +ONNX_OPERATOR_KERNEL_EX( + FlattenAndUnpad, + kMSDomain, + 1, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", BuildKernelDefConstraints()) + .TypeConstraint("T_INT", DataTypeImpl::GetTensorType()) + .OutputMemoryType(OrtMemTypeCPUOutput, 1), + FlattenAndUnpad); + +// Put implementation in the anonymous namespace to avoid name collision in the global namespace. 
+namespace { + +template +struct FlattenAndUnpadFunctor { + void operator()(cudaStream_t stream, + const int64_t output_element_count, + const fast_divmod output_element_stride_fdm, + const int64_t index_value_upper_bound, + const Tensor& input_tensor, + const Tensor& indices_tensor, + Tensor& output_tensor) const { + typedef typename ToCudaType::MappedType CudaT; + const CudaT* input_data = reinterpret_cast(input_tensor.Data()); + + FlattenAndUnpadImpl(stream, output_element_count, output_element_stride_fdm, index_value_upper_bound, + input_data, indices_tensor.Data(), + reinterpret_cast(output_tensor.MutableData())); + } +}; + +} // namespace + +Status FlattenAndUnpad::ComputeInternal(OpKernelContext* context) const { + const Tensor* input_tensor = context->Input(0); + const Tensor* indices_tensor = context->Input(1); + ORT_ENFORCE(input_tensor->Shape().NumDimensions() >= 2, + "input_tensor tensor must have at least 2 dimensions.", input_tensor->Shape().NumDimensions()); + ORT_ENFORCE(indices_tensor->Shape().NumDimensions() == 1, + "indices_tensor tensor must be 1-D.", indices_tensor->Shape().NumDimensions()); + + const auto& input_shape = input_tensor->Shape(); + std::vector output_shape_vec; + output_shape_vec.reserve(input_shape.NumDimensions() - 1); + output_shape_vec.push_back(indices_tensor->Shape()[0]); + int64_t element_stride = 1; + for (size_t i = 2; i < input_shape.NumDimensions(); ++i) { + output_shape_vec.push_back(input_shape[i]); + element_stride *= input_shape[i]; + } + + fast_divmod output_element_stride_fdm(static_cast(element_stride)); + auto output_shape = TensorShape(output_shape_vec); + Tensor* output_tensor = context->Output(0, output_shape); + + std::vector unflatten_dims_vec; + unflatten_dims_vec.reserve(2); + unflatten_dims_vec.push_back(input_shape[0]); + unflatten_dims_vec.push_back(input_shape[1]); + const int64_t index_value_upper_bound = input_shape[0] * input_shape[1]; + + utils::MLTypeCallDispatcher + t_disp(input_tensor->GetElementType()); + t_disp.Invoke(Stream(context), + output_shape.Size(), + output_element_stride_fdm, + index_value_upper_bound, + *input_tensor, + *indices_tensor, + *output_tensor); + + size_t rank = unflatten_dims_vec.size(); + Tensor* unflatten_dims_tensor = context->Output(1, {static_cast(rank)}); + TensorShape(unflatten_dims_vec).CopyDims(unflatten_dims_tensor->MutableData(), rank); + + return Status::OK(); +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.h b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.h new file mode 100644 index 0000000000000..f9c6819a393b8 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.h @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/providers/cuda/cuda_kernel.h" +#include "core/providers/common.h" + +namespace onnxruntime { +namespace cuda { + +class FlattenAndUnpad final : public CudaKernel { + public: + FlattenAndUnpad(const OpKernelInfo& info) : CudaKernel(info) { + } + + Status ComputeInternal(OpKernelContext* context) const override; +}; + +} // namespace cuda +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.cu new file mode 100644 index 0000000000000..2091a7082ee79 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.cu @@ -0,0 +1,83 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h" +#include "core/providers/cuda/cu_inc/common.cuh" + +namespace onnxruntime { +namespace cuda { + +constexpr int kBlockSize = 256; +constexpr int kNumUnroll = 4; + +template +__global__ void ExtractIputWithIndexKernel(const CUDA_LONG N, + const fast_divmod output_element_stride_fdm, + const int64_t index_value_upper_bound, + const T* input_data, + const int64_t* indices_data, + T* output_data) { + CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x; + CUDA_LONG id = idx * kNumUnroll; + + T input[kNumUnroll]; + if (id < N) { +#pragma unroll + for (int i = 0; i < kNumUnroll; ++i) { + CUDA_LONG li = id + i; + if (li < N) { + int row_index, col_index; + output_element_stride_fdm.divmod(li, row_index, col_index); + assert(indices_data[row_index] < index_value_upper_bound); + input[i] = input_data[indices_data[row_index] * output_element_stride_fdm.d_ + col_index]; + } + } + } + +#pragma unroll + for (int i = 0; i < kNumUnroll; ++i) { + CUDA_LONG li = id + i; + if (li < N) { + output_data[li] = input[i]; + } + } +} + +template +void FlattenAndUnpadImpl(cudaStream_t stream, + const int64_t total_element_count, + const fast_divmod output_element_stride_fdm, + const int64_t index_value_upper_bound, + const T* input_data, + const int64_t* indices_data, + T* output_data) { + const int blocksPerGrid = static_cast(CeilDiv(total_element_count, kBlockSize * kNumUnroll)); + ExtractIputWithIndexKernel<<>>( + static_cast(total_element_count), + output_element_stride_fdm, + index_value_upper_bound, + input_data, + indices_data, + output_data); +} + +#define FLATTEN_AND_UNPAD_IMPL(T) \ + template void FlattenAndUnpadImpl(cudaStream_t stream, \ + const int64_t total_element_count, \ + const fast_divmod output_element_stride_fdm, \ + const int64_t index_value_upper_bound, \ + const T* input_data, \ + const int64_t* indices_data, \ + T* output_data); + +FLATTEN_AND_UNPAD_IMPL(float) +FLATTEN_AND_UNPAD_IMPL(double) +FLATTEN_AND_UNPAD_IMPL(half) +FLATTEN_AND_UNPAD_IMPL(BFloat16) +FLATTEN_AND_UNPAD_IMPL(int32_t) +FLATTEN_AND_UNPAD_IMPL(int64_t) + +#undef FLATTEN_AND_UNPAD_FROM_MASK_IMPL + +} // namespace cuda +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h new file mode 100644 index 0000000000000..75f8c243d3425 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
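+//
+// Contract for FlattenAndUnpadImpl (see flatten_and_unpad_impl.cu): `total_element_count` is the
+// number of output elements, `output_element_stride_fdm` holds the product of the trailing input
+// dimensions, and every entry of `indices_data` is expected to be < `index_value_upper_bound`
+// (i.e. d0 * d1 of the original input); the kernel asserts this.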
+ +#pragma once + +#ifdef USE_ROCM +#include "core/providers/rocm/shared_inc/rocm_utils.h" +#else +#include "core/providers/cuda/shared_inc/cuda_utils.h" +#endif + +namespace onnxruntime { +namespace cuda { + +template +void FlattenAndUnpadImpl(cudaStream_t stream, + const int64_t total_element_count, + const fast_divmod output_element_stride_fdm, + const int64_t index_value_upper_bound, + const T* input_data, + const int64_t* indices_data, + T* output_data); + +} // namespace cuda +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten.cc b/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten.cc index caf89ef840e0c..7bd759e8976c1 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten.cc @@ -17,8 +17,7 @@ ONNX_OPERATOR_KERNEL_EX( .TypeConstraint("T", BuildKernelDefConstraints()) .TypeConstraint("T_INT", DataTypeImpl::GetTensorType()) .TypeConstraint("T_INDEX", DataTypeImpl::GetTensorType()) - .InputMemoryType(OrtMemTypeCPUInput, 2) - .OutputMemoryType(OrtMemTypeCPUOutput, 1), + .InputMemoryType(OrtMemTypeCPUInput, 2), PadAndUnflatten); // Put implementation in the anonymous namespace to avoid name collision in the global namespace. @@ -63,14 +62,11 @@ Status PadAndUnflatten::ComputeInternal(OpKernelContext* context) const { output_shape_vec.push_back(dims_ptr[0]); output_shape_vec.push_back(dims_ptr[1]); - std::vector full_size_flatten_shape_vec; const int64_t flatten_dim_factor = dims_ptr[0] * dims_ptr[1]; - full_size_flatten_shape_vec.push_back(flatten_dim_factor); int64_t element_stride = 1; for (size_t i = 1; i < input_shape.NumDimensions(); ++i) { output_shape_vec.push_back(input_shape[i]); - full_size_flatten_shape_vec.push_back(input_shape[i]); element_stride *= input_shape[i]; } @@ -87,11 +83,6 @@ Status PadAndUnflatten::ComputeInternal(OpKernelContext* context) const { *indices_tensor, *output_tensor); - // Set input shape output tensor. 
- size_t rank = full_size_flatten_shape_vec.size(); - Tensor* input_shape_tensor = context->Output(1, {static_cast(rank)}); - TensorShape(full_size_flatten_shape_vec).CopyDims(input_shape_tensor->MutableData(), rank); - return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten_impl.cu index 22a4f518dfa47..e96770f974bf0 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten_impl.cu @@ -61,7 +61,7 @@ void PadAndUnflattenImpl(cudaStream_t stream, output_data); } -#define SPECIALIZED_RESTORE_FROM_MASK_IMPL(T) \ +#define PAD_AND_UNFLATTEN_IMPL(T) \ template void PadAndUnflattenImpl(cudaStream_t stream, \ const int64_t total_element_count, \ const fast_divmod output_element_stride_fdm, \ @@ -70,12 +70,12 @@ void PadAndUnflattenImpl(cudaStream_t stream, const int64_t* indices_data, \ T* output_data); -SPECIALIZED_RESTORE_FROM_MASK_IMPL(float) -SPECIALIZED_RESTORE_FROM_MASK_IMPL(double) -SPECIALIZED_RESTORE_FROM_MASK_IMPL(half) -SPECIALIZED_RESTORE_FROM_MASK_IMPL(BFloat16) +PAD_AND_UNFLATTEN_IMPL(float) +PAD_AND_UNFLATTEN_IMPL(double) +PAD_AND_UNFLATTEN_IMPL(half) +PAD_AND_UNFLATTEN_IMPL(BFloat16) -#undef SPECIALIZED_RESTORE_FROM_MASK_IMPL +#undef PAD_AND_UNFLATTEN_FROM_MASK_IMPL } // namespace cuda } // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc index e0749c2fb4d0d..e107a2542fc0b 100644 --- a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc +++ b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc @@ -187,6 +187,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_BFloat16, ReduceAllL2); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16_BFloat16, ReduceAllL2); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, PadAndUnflatten); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, FlattenAndUnpad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ResizeGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, ResizeGrad); @@ -390,6 +391,7 @@ Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, From 7a3da4526f98c9cfc6387a5faa1edeec7d88ef17 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Wed, 8 Nov 2023 18:32:12 -0800 Subject: [PATCH 12/13] add bfloat16 support for CUDA Neg kernel (#18306) ### Description Registers BFloat16 datatype as valid input type for CUDA Neg Kernel. ### Motivation and Context Enabling `meta-llama/Llama-2-70b` to be finetuned with ONNX Runtime training. 
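A minimal way to exercise the new registration is to build a one-node bfloat16 `Neg` model and create a session against the CUDA EP. The sketch below is illustrative only (tensor names and shapes are arbitrary, and it assumes a CUDA-enabled onnxruntime build); feeding bfloat16 inputs from Python additionally requires a bfloat16-capable buffer, which is omitted here.

```python
import onnx
from onnx import TensorProto, helper
import onnxruntime as ort

# Build a one-node Neg model with bfloat16 input/output (opset 13).
x = helper.make_tensor_value_info("X", TensorProto.BFLOAT16, [2, 3])
y = helper.make_tensor_value_info("Y", TensorProto.BFLOAT16, [2, 3])
neg = helper.make_node("Neg", inputs=["X"], outputs=["Y"])
graph = helper.make_graph([neg], "neg_bf16", [x], [y])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
onnx.checker.check_model(model)

# With this change, the Neg node can be assigned to the CUDA EP instead of falling back to CPU.
sess = ort.InferenceSession(model.SerializeToString(), providers=["CUDAExecutionProvider"])
print(sess.get_providers())
```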
--------- Co-authored-by: Prathik Rao --- docs/OperatorKernels.md | 2 +- .../core/providers/cuda/cuda_execution_provider.cc | 2 ++ .../core/providers/cuda/math/unary_elementwise_ops.cc | 9 ++++++++- .../providers/cuda/math/unary_elementwise_ops_impl.cu | 9 +++++---- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 38783ac044c22..8e546b30aa4cb 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -665,7 +665,7 @@ Do not modify directly.* |Mul|*in* A:**T**
*in* B:**T**<br> *out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
 |||13|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
 |||[7, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
-|Neg|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)|
+|Neg|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)|
 |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)|
 |NonZero|*in* X:**T**<br>
*out* Y:**tensor(int64)**|13+|**T** = tensor(bool), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint8)| |||[9, 12]|**T** = tensor(bool), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint8)| diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 2d242d7d6fb12..d8a0792209b0f 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -971,6 +971,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Neg); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, BFloat16, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Floor); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Floor); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Floor); @@ -1855,6 +1856,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc index 9ede1f8d90ecc..655877f425054 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc @@ -99,6 +99,7 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa // F: float // D: double // O: bool +// X: BFloat16 #define UNARY_OP_VERSIONED_HFD(name, startver, endver) \ UNARY_OP_VERSIONED_TYPED(name, startver, endver, MLFloat16) \ @@ -124,12 +125,18 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa UNARY_OP_TYPED(name, ver, float) \ UNARY_OP_TYPED(name, ver, double) +#define UNARY_OP_HFDX(name, ver) \ + UNARY_OP_TYPED(name, ver, MLFloat16) \ + UNARY_OP_TYPED(name, ver, BFloat16) \ + UNARY_OP_TYPED(name, ver, float) \ + UNARY_OP_TYPED(name, ver, double) + #define UNARY_OP_CSILHFD(name, ver) \ UNARY_OP_TYPED(name, ver, int8_t) \ UNARY_OP_TYPED(name, ver, int16_t) \ UNARY_OP_TYPED(name, ver, int32_t) \ UNARY_OP_TYPED(name, ver, int64_t) \ - UNARY_OP_HFD(name, ver) + UNARY_OP_HFDX(name, ver) #define UNARY_OP_BWUZCSILHFD(name, ver) \ UNARY_OP_TYPED(name, ver, uint8_t) \ diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu index 1298d53338337..5c3db4a499972 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu @@ -53,13 +53,14 @@ UNARY_OPS() // F: float // D: double // O: bool +// X: BFloat16 #define SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(name) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, half) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, float) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, double) -#define SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDB(name) \ 
+#define SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDX(name) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(name) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, BFloat16) @@ -68,7 +69,7 @@ UNARY_OPS() SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, int16_t) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, int32_t) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, int64_t) \ - SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(name) + SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDX(name) #define SPECIALIZED_UNARY_ELEMENTWISE_IMPL_BWUZCSILHFD(name) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, uint8_t) \ @@ -83,8 +84,8 @@ SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Floor) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Ceil) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Reciprocal) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Sqrt) -SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDB(Log) -SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDB(Exp) +SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDX(Log) +SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDX(Exp) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Erf) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Round) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Sin) From 55c19d6ab5d49a95f58e66baccdb5f81092e4eeb Mon Sep 17 00:00:00 2001 From: Hector Li Date: Wed, 8 Nov 2023 20:56:36 -0800 Subject: [PATCH 13/13] [QNN EP] Enable option to set QNN context priority (#18315) Enable option qnn_context_priority to set QNN context priority, options: "low", "normal", "normal_high", "high". ### Description Enable option qnn_context_priority to set QNN context priority, options: "low", "normal", "normal_high", "high". This feature guarantees the model inference with higher priority. Tested with onnxruntime_perf_test tool using same model. 1. Run the model on the NPU with single instance, the latency is 300ms. 2. Run the same model on NPU with 2 instance at same time. Case 1: both with same priority (high ) -- latency is 600ms Case 2: 1 with low priority -- latency is 30,000ms 1 with high priority -- latency is 300ms Case 3: 1 with normal priority -- latency is 15,000ms 1 with high priority -- latency is 300ms --- .../core/session/onnxruntime_c_api.h | 1 + .../qnn/builder/qnn_backend_manager.cc | 42 +++++++++++- .../qnn/builder/qnn_backend_manager.h | 4 +- .../core/providers/qnn/builder/qnn_def.h | 8 +++ .../providers/qnn/qnn_execution_provider.cc | 66 +++++++++++++------ .../providers/qnn/qnn_execution_provider.h | 3 +- onnxruntime/test/onnx/main.cc | 8 ++- .../test/perftest/command_args_parser.cc | 5 +- onnxruntime/test/perftest/ort_test_session.cc | 7 +- .../test/providers/qnn/qnn_basic_test.cc | 16 ++++- 10 files changed, 131 insertions(+), 29 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 1d02b72342722..c7d4a236bcf89 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -3604,6 +3604,7 @@ struct OrtApi { * "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will * dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and * may alter model/EP partitioning. Use only for debugging. + * "qnn_context_priority": QNN context priority, options: "low", "normal", "normal_high", "high". Default to "normal". * "htp_graph_finalization_optimization_mode": Set the optimization mode for graph finalization on the HTP backend. Available options: * - "0": Default. * - "1": Faster preparation time, less optimal graph. 
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index dd56731ac9f7f..03d6b46c528c3 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -380,15 +380,48 @@ Status QnnBackendManager::ReleaseProfilehandle() { return Status::OK(); } +Status SetQnnContextConfig(ContextPriority context_priority, QnnContext_Config_t& qnn_context_config) { + qnn_context_config.option = QNN_CONTEXT_CONFIG_OPTION_PRIORITY; + switch (context_priority) { + case ContextPriority::LOW: { + qnn_context_config.priority = QNN_PRIORITY_LOW; + break; + } + case ContextPriority::NORMAL: { + qnn_context_config.priority = QNN_PRIORITY_NORMAL; + break; + } + case ContextPriority::NORMAL_HIGH: { + qnn_context_config.priority = QNN_PRIORITY_NORMAL_HIGH; + break; + } + case ContextPriority::HIGH: { + qnn_context_config.priority = QNN_PRIORITY_HIGH; + break; + } + case ContextPriority::UNDEFINED: { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid Qnn context priority."); + } + default: + qnn_context_config.priority = QNN_PRIORITY_NORMAL; + } // switch + + return Status::OK(); +} + Status QnnBackendManager::CreateContext() { if (true == context_created_) { LOGS_DEFAULT(INFO) << "Context created already."; return Status::OK(); } + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, qnn_context_config)); + const QnnContext_Config_t* context_configs[] = {&qnn_context_config, nullptr}; + auto result = qnn_interface_.contextCreate(backend_handle_, device_handle_, - (const QnnContext_Config_t**)&context_config_, + context_configs, &context_); ORT_RETURN_IF(QNN_CONTEXT_NO_ERROR != result, "Failed to create context."); @@ -486,9 +519,14 @@ Status QnnBackendManager::LoadCachedQnnContextFromBuffer(char* buffer, uint64_t ORT_RETURN_IF(nullptr == qnn_interface_.contextCreateFromBinary, "Invalid function pointer for contextCreateFromBinary."); + + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, qnn_context_config)); + const QnnContext_Config_t* context_configs[] = {&qnn_context_config, nullptr}; + rt = qnn_interface_.contextCreateFromBinary(backend_handle_, device_handle_, - (const QnnContext_Config_t**)&context_config_, + context_configs, static_cast(buffer), buffer_length, &context_, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index de5ccb5a28389..aac82c89d6f49 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -30,11 +30,13 @@ class QnnBackendManager { ProfilingLevel profiling_level, uint32_t rpc_control_latency, HtpPerformanceMode htp_performance_mode, + ContextPriority context_priority, std::string&& qnn_saver_path) : backend_path_(backend_path), profiling_level_(profiling_level), rpc_control_latency_(rpc_control_latency), htp_performance_mode_(htp_performance_mode), + context_priority_(context_priority), qnn_saver_path_(qnn_saver_path) { } ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QnnBackendManager); @@ -186,7 +188,6 @@ class QnnBackendManager { Qnn_LogHandle_t log_handle_ = nullptr; Qnn_DeviceHandle_t device_handle_ = nullptr; Qnn_ContextHandle_t context_ = nullptr; - QnnContext_Config_t** context_config_ = 
nullptr; ProfilingLevel profiling_level_; bool backend_initialized_ = false; bool device_created_ = false; @@ -198,6 +199,7 @@ class QnnBackendManager { std::vector op_package_paths_; uint32_t rpc_control_latency_ = 0; HtpPerformanceMode htp_performance_mode_; + ContextPriority context_priority_; std::string sdk_build_version_ = ""; #ifdef _WIN32 std::set mod_handles_; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_def.h b/onnxruntime/core/providers/qnn/builder/qnn_def.h index 6080c63b555a8..66154fcf346ee 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_def.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_def.h @@ -48,6 +48,14 @@ enum class HtpPerformanceMode : uint8_t { kHtpBalanced, }; +enum class ContextPriority : uint8_t { + LOW = 0, + NORMAL, + NORMAL_HIGH, + HIGH, + UNDEFINED +}; + // Defines the graph optimization strategy used by the HTP backend. enum class HtpGraphFinalizationOptimizationMode : uint8_t { kDefault = 0, diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 6cb276378a09c..8acd0d68b71d0 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -76,6 +76,26 @@ void QNNExecutionProvider::ParseHtpPerformanceMode(std::string htp_performance_m } } +void QNNExecutionProvider::ParseQnnContextPriority(std::string context_priority_string) { + std::transform(context_priority_string.begin(), + context_priority_string.end(), + context_priority_string.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + LOGS_DEFAULT(VERBOSE) << "QNN context priority: " << context_priority_string; + if (context_priority_string == "low") { + context_priority_ = qnn::ContextPriority::LOW; + } else if (context_priority_string == "normal") { + context_priority_ = qnn::ContextPriority::NORMAL; + } else if (context_priority_string == "normal_high") { + context_priority_ = qnn::ContextPriority::NORMAL_HIGH; + } else if (context_priority_string == "high") { + context_priority_ = qnn::ContextPriority::HIGH; + } else { + context_priority_ = qnn::ContextPriority::UNDEFINED; + LOGS_DEFAULT(WARNING) << "QNN context priority: " << context_priority_string << " not valid, set to undefined."; + } +} + void QNNExecutionProvider::ParseHtpGraphFinalizationOptimizationMode(const std::string& htp_graph_finalization_opt_mode_string) { LOGS_DEFAULT(VERBOSE) << "HTP graph finalization optimization mode: " << htp_graph_finalization_opt_mode_string; @@ -96,16 +116,15 @@ void QNNExecutionProvider::ParseHtpGraphFinalizationOptimizationMode(const std:: QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options) - : IExecutionProvider{onnxruntime::kQnnExecutionProvider, true}, - runtime_options_(provider_options_map) { + : IExecutionProvider{onnxruntime::kQnnExecutionProvider, true} { if (session_options) { disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault( kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; } static const std::string CONTEXT_CACHE_ENABLED = "qnn_context_cache_enable"; - auto context_cache_enabled_pos = runtime_options_.find(CONTEXT_CACHE_ENABLED); - if (context_cache_enabled_pos != runtime_options_.end()) { + auto context_cache_enabled_pos = provider_options_map.find(CONTEXT_CACHE_ENABLED); + if (context_cache_enabled_pos != provider_options_map.end()) { if (context_cache_enabled_pos->second == "1") { 
context_cache_enabled_ = true; LOGS_DEFAULT(VERBOSE) << "Context cache enabled."; @@ -113,25 +132,25 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } static const std::string CONTEXT_CACHE_PATH = "qnn_context_cache_path"; - auto context_cache_path_pos = runtime_options_.find(CONTEXT_CACHE_PATH); - if (context_cache_path_pos != runtime_options_.end()) { + auto context_cache_path_pos = provider_options_map.find(CONTEXT_CACHE_PATH); + if (context_cache_path_pos != provider_options_map.end()) { context_cache_path_ = context_cache_path_pos->second; LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_; } bool qnn_context_embed_mode = true; static const std::string CONTEXT_CACHE_EMBED_MODE = "qnn_context_embed_mode"; - auto context_cache_embed_mode_pos = runtime_options_.find(CONTEXT_CACHE_EMBED_MODE); - if (context_cache_embed_mode_pos != runtime_options_.end()) { + auto context_cache_embed_mode_pos = provider_options_map.find(CONTEXT_CACHE_EMBED_MODE); + if (context_cache_embed_mode_pos != provider_options_map.end()) { qnn_context_embed_mode = context_cache_embed_mode_pos->second == "1"; LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode; } static const std::string BACKEND_PATH = "backend_path"; - auto backend_path_pos = runtime_options_.find(BACKEND_PATH); + auto backend_path_pos = provider_options_map.find(BACKEND_PATH); std::string backend_path; - if (backend_path_pos != runtime_options_.end()) { + if (backend_path_pos != provider_options_map.end()) { backend_path = backend_path_pos->second; LOGS_DEFAULT(VERBOSE) << "Backend path: " << backend_path; } else { @@ -139,46 +158,53 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } static const std::string PROFILING_LEVEL = "profiling_level"; - auto profiling_level_pos = runtime_options_.find(PROFILING_LEVEL); - if (profiling_level_pos != runtime_options_.end()) { + auto profiling_level_pos = provider_options_map.find(PROFILING_LEVEL); + if (profiling_level_pos != provider_options_map.end()) { ParseProfilingLevel(profiling_level_pos->second); } static const std::string RPC_CONTROL_LANTENCY = "rpc_control_latency"; - auto latency_pos = runtime_options_.find(RPC_CONTROL_LANTENCY); - if (latency_pos != runtime_options_.end()) { + auto latency_pos = provider_options_map.find(RPC_CONTROL_LANTENCY); + if (latency_pos != provider_options_map.end()) { rpc_control_latency_ = static_cast(std::stoul(latency_pos->second)); LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency_; } htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; static const std::string HTP_PERFORMANCE_MODE = "htp_performance_mode"; - auto htp_performance_mode_pos = runtime_options_.find(HTP_PERFORMANCE_MODE); - if (htp_performance_mode_pos != runtime_options_.end()) { + auto htp_performance_mode_pos = provider_options_map.find(HTP_PERFORMANCE_MODE); + if (htp_performance_mode_pos != provider_options_map.end()) { ParseHtpPerformanceMode(htp_performance_mode_pos->second); } htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; static const std::string HTP_GRAPH_FINALIZATION_OPT_MODE = "htp_graph_finalization_optimization_mode"; - auto htp_graph_finalization_opt_mode_pos = runtime_options_.find(HTP_GRAPH_FINALIZATION_OPT_MODE); - if (htp_graph_finalization_opt_mode_pos != runtime_options_.end()) { + auto htp_graph_finalization_opt_mode_pos = 
provider_options_map.find(HTP_GRAPH_FINALIZATION_OPT_MODE); + if (htp_graph_finalization_opt_mode_pos != provider_options_map.end()) { ParseHtpGraphFinalizationOptimizationMode(htp_graph_finalization_opt_mode_pos->second); } // Enable use of QNN Saver if the user provides a path the QNN Saver backend library. static const std::string QNN_SAVER_PATH_KEY = "qnn_saver_path"; std::string qnn_saver_path; - auto qnn_saver_path_pos = runtime_options_.find(QNN_SAVER_PATH_KEY); - if (qnn_saver_path_pos != runtime_options_.end()) { + auto qnn_saver_path_pos = provider_options_map.find(QNN_SAVER_PATH_KEY); + if (qnn_saver_path_pos != provider_options_map.end()) { qnn_saver_path = qnn_saver_path_pos->second; LOGS_DEFAULT(VERBOSE) << "User specified QNN Saver path: " << qnn_saver_path; } + static const std::string QNN_CONTEXT_PRIORITY = "qnn_context_priority"; + auto qnn_context_priority_pos = provider_options_map.find(QNN_CONTEXT_PRIORITY); + if (qnn_context_priority_pos != provider_options_map.end()) { + ParseQnnContextPriority(qnn_context_priority_pos->second); + } + qnn_backend_manager_ = std::make_unique( std::move(backend_path), profiling_level_, rpc_control_latency_, htp_performance_mode_, + context_priority_, std::move(qnn_saver_path)); qnn_cache_model_handler_ = std::make_unique(qnn_context_embed_mode); } diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index a01b828531555..cf0bff8890d0c 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -57,13 +57,13 @@ class QNNExecutionProvider : public IExecutionProvider { const logging::Logger& logger); void ParseHtpPerformanceMode(std::string htp_performance_mode_string); + void ParseQnnContextPriority(std::string context_priority_string); void ParseHtpGraphFinalizationOptimizationMode(const std::string& htp_graph_finalization_opt_mode_string); void InitQnnGraphConfigs(qnn::QnnGraphConfigsBuilder& configs_holder) const; private: - ProviderOptions runtime_options_; qnn::ProfilingLevel profiling_level_ = qnn::ProfilingLevel::OFF; qnn::HtpPerformanceMode htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; qnn::HtpGraphFinalizationOptimizationMode htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; @@ -74,6 +74,7 @@ class QNNExecutionProvider : public IExecutionProvider { std::string context_cache_path_ = ""; bool disable_cpu_ep_fallback_ = false; // True if CPU EP fallback has been disabled for this session. std::unique_ptr qnn_cache_model_handler_; + qnn::ContextPriority context_priority_ = qnn::ContextPriority::NORMAL; }; } // namespace onnxruntime diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 98646058eec3d..2c0804397cfe8 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -56,6 +56,7 @@ void usage() { "\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n" "\t [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n" "\t 'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n" + "\t [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. 
\n" "\t [QNN only] [qnn_context_embed_mode]: 1 means dump the QNN context binary into the Onnx skeleton model.\n" "\t 0 means dump the QNN context binary into separate bin file and set the path in the Onnx skeleton model.\n" "\t [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n" @@ -488,6 +489,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) { std::string str = str_stream.str(); ORT_THROW("Wrong value for htp_performance_mode. select from: " + str); } + } else if (key == "qnn_context_priority") { + std::set supported_qnn_context_priority = {"low", "normal", "normal_high", "high"}; + if (supported_qnn_context_priority.find(value) == supported_qnn_context_priority.end()) { + ORT_THROW("Supported qnn_context_priority: low, normal, normal_high, high"); + } } else if (key == "qnn_saver_path") { // no validation } else if (key == "htp_graph_finalization_optimization_mode") { @@ -502,7 +508,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { } else { ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable', 'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode', 'qnn_saver_path', -'htp_graph_finalization_optimization_mode'])"); +'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])"); } qnn_options[key] = value; diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 72472e5798792..a72a0d105eefc 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -34,8 +34,8 @@ namespace perftest { "\t-A: Disable memory arena\n" "\t-I: Generate tensor input binding (Free dimensions are treated as 1.)\n" "\t-c [parallel runs]: Specifies the (max) number of runs to invoke simultaneously. Default:1.\n" - "\t-e [cpu|cuda|dnnl|tensorrt|openvino|dml|acl|nnapi|coreml|snpe|rocm|migraphx|xnnpack|vitisai]: Specifies the provider 'cpu','cuda','dnnl','tensorrt', " - "'openvino', 'dml', 'acl', 'nnapi', 'coreml', 'snpe', 'rocm', 'migraphx', 'xnnpack' or 'vitisai'. " + "\t-e [cpu|cuda|dnnl|tensorrt|openvino|dml|acl|nnapi|coreml|qnn|snpe|rocm|migraphx|xnnpack|vitisai]: Specifies the provider 'cpu','cuda','dnnl','tensorrt', " + "'openvino', 'dml', 'acl', 'nnapi', 'coreml', 'qnn', 'snpe', 'rocm', 'migraphx', 'xnnpack' or 'vitisai'. " "Default:'cpu'.\n" "\t-b [tf|ort]: backend to use. Default:ort\n" "\t-r [repeated_times]: Specifies the repeated times if running in 'times' test mode.Default:1000.\n" @@ -71,6 +71,7 @@ namespace perftest { "\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n" "\t [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n" "\t 'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n" + "\t [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. \n" "\t [QNN only] [qnn_saver_path]: QNN Saver backend path. 
e.g '/folderpath/libQnnSaver.so'.\n" "\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n" "\t '0', '1', '2', '3', default is '0'.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index f3ea188043dbe..c2dd81ec9f359 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -367,10 +367,15 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device std::string str = str_stream.str(); ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str); } + } else if (key == "qnn_context_priority") { + std::set supported_qnn_context_priority = {"low", "normal", "normal_high", "high"}; + if (supported_qnn_context_priority.find(value) == supported_qnn_context_priority.end()) { + ORT_THROW("Supported qnn_context_priority: low, normal, normal_high, high"); + } } else { ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable', 'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode', 'qnn_saver_path', -'htp_graph_finalization_optimization_mode'])"); +'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])"); } qnn_options[key] = value; diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 02ff834169b2b..2e2acb36e8071 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -174,7 +174,8 @@ TEST(QnnEP, TestDisableCPUFallback_ConflictingConfig) { // shape inferencing issues on QNN. Thus, the models are expected to have a specific input/output // types and shapes. static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bool enable_qnn_saver = false, - std::string htp_graph_finalization_opt_mode = "") { + std::string htp_graph_finalization_opt_mode = "", + std::string qnn_context_priority = "") { Ort::SessionOptions so; // Ensure all type/shape inference warnings result in errors! @@ -199,6 +200,10 @@ static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bo options["htp_graph_finalization_optimization_mode"] = std::move(htp_graph_finalization_opt_mode); } + if (!qnn_context_priority.empty()) { + options["qnn_context_priority"] = std::move(qnn_context_priority); + } + so.AppendExecutionProvider("QNN", options); Ort::Session session(*ort_env, ort_model_path, so); @@ -322,6 +327,15 @@ TEST_F(QnnHTPBackendTests, HTPGraphFinalizationOptimizationModes) { } } +// Test that models run with high QNN context priority. +TEST_F(QnnHTPBackendTests, QnnContextPriorityHigh) { + RunNHWCResizeModel(ORT_MODEL_FOLDER "nhwc_resize_sizes_opset18.quant.onnx", + true, // use_htp + false, // enable_qnn_saver + "", // htp_graph_finalization_opt_mode + "high"); // qnn_context_priority +} + #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) #endif // !defined(ORT_MINIMAL_BUILD)
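
For completeness, a hedged sketch of how an application could opt into the new context priority from Python; the file paths and model name below are placeholders, and the same key/value pairs can be passed from C++ via `AppendExecutionProvider("QNN", options)` as in the test above.

```python
import onnxruntime as ort

# Placeholder paths; requires an onnxruntime build with the QNN EP.
qnn_options = {
    "backend_path": "QnnHtp.dll",        # QNN backend library (HTP)
    "qnn_context_priority": "high",      # new option: low | normal | normal_high | high
}
sess = ort.InferenceSession("model.onnx", providers=[("QNNExecutionProvider", qnn_options)])
```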