From 8044e5f60332940217576a067d99b454a83c5457 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 8 Nov 2023 00:42:55 -0800 Subject: [PATCH 01/13] SDXL: Update demo with dynamic shape serving with CUDA EP (#18340) Update the SDXL demo with dynamic shape serving with CUDA EP. --- .../stable_diffusion/demo_txt2img_xl.py | 103 ++++++++++++++---- .../models/stable_diffusion/demo_utils.py | 2 +- .../stable_diffusion/diffusion_models.py | 23 +++- .../pipeline_stable_diffusion.py | 9 +- 4 files changed, 107 insertions(+), 30 deletions(-) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py index 16e776a08282c..0b529875a2fe7 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py @@ -29,17 +29,7 @@ from pipeline_txt2img_xl import Txt2ImgXLPipeline -def run_demo(): - """Run Stable Diffusion XL Base + Refiner together (known as ensemble of expert denoisers) to generate an image.""" - - args = parse_arguments(is_xl=True, description="Options for Stable Diffusion XL Demo") - - prompt, negative_prompt = repeat_prompt(args) - - # Recommend image size as one of those used in training (see Appendix I in https://arxiv.org/pdf/2307.01952.pdf). - image_height = args.height - image_width = args.width - +def load_pipelines(args, batch_size): # Register TensorRT plugins engine_type = get_engine_type(args.engine) if engine_type == EngineType.TRT: @@ -49,19 +39,18 @@ def run_demo(): max_batch_size = 16 if (engine_type in [EngineType.ORT_TRT, EngineType.TRT]) and ( - args.build_dynamic_shape or image_height > 512 or image_width > 512 + args.build_dynamic_shape or args.height > 512 or args.width > 512 ): max_batch_size = 4 - batch_size = len(prompt) if batch_size > max_batch_size: raise ValueError(f"Batch size {batch_size} is larger than allowed {max_batch_size}.") # No VAE decoder in base when it outputs latent instead of image. 
- base_info = PipelineInfo(args.version, use_vae=False) + base_info = PipelineInfo(args.version, use_vae=False, min_image_size=640, max_image_size=1536) base = init_pipeline(Txt2ImgXLPipeline, base_info, engine_type, args, max_batch_size, batch_size) - refiner_info = PipelineInfo(args.version, is_refiner=True) + refiner_info = PipelineInfo(args.version, is_refiner=True, min_image_size=640, max_image_size=1536) refiner = init_pipeline(Img2ImgXLPipeline, refiner_info, engine_type, args, max_batch_size, batch_size) if engine_type == EngineType.TRT: @@ -77,7 +66,13 @@ def run_demo(): enable_vae_slicing = True if enable_vae_slicing: refiner.backend.enable_vae_slicing() + return base, refiner + +def run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False): + image_height = args.height + image_width = args.width + batch_size = len(prompt) base.load_resources(image_height, image_width, batch_size) refiner.load_resources(image_height, image_width, batch_size) @@ -112,10 +107,14 @@ def run_base_and_refiner(warmup=False): # inference once to get cuda graph _, _ = run_base_and_refiner(warmup=True) - print("[I] Warming up ..") + if args.num_warmup_runs > 0: + print("[I] Warming up ..") for _ in range(args.num_warmup_runs): _, _ = run_base_and_refiner(warmup=True) + if is_warm_up: + return + print("[I] Running StableDiffusion XL pipeline") if args.nvtx_profile: cudart.cudaProfilerStart() @@ -123,14 +122,80 @@ def run_base_and_refiner(warmup=False): if args.nvtx_profile: cudart.cudaProfilerStop() - base.teardown() - print("|------------|--------------|") print("| {:^10} | {:>9.2f} ms |".format("e2e", latency)) print("|------------|--------------|") + + +def run_demo(args): + """Run Stable Diffusion XL Base + Refiner together (known as ensemble of expert denoisers) to generate an image.""" + + prompt, negative_prompt = repeat_prompt(args) + batch_size = len(prompt) + base, refiner = load_pipelines(args, batch_size) + run_pipelines(args, base, refiner, prompt, negative_prompt) + base.teardown() + refiner.teardown() + + +def run_dynamic_shape_demo(args): + """Run demo of generating images with different size with list of prompts with ORT CUDA provider.""" + args.engine = "ORT_CUDA" + args.scheduler = "UniPC" + args.denoising_steps = 8 + args.disable_cuda_graph = True + + batch_size = args.repeat_prompt + base, refiner = load_pipelines(args, batch_size) + + image_sizes = [ + (1024, 1024), + (1152, 896), + (896, 1152), + (1216, 832), + (832, 1216), + (1344, 768), + (768, 1344), + (1536, 640), + (640, 1536), + ] + + # Warm up the pipelines. This only need once before serving. + args.prompt = ["warm up"] + args.num_warmup_runs = 3 + prompt, negative_prompt = repeat_prompt(args) + for height, width in image_sizes: + args.height = height + args.width = width + print(f"\nWarm up pipelines for Batch_size={batch_size}, Height={height}, Width={width}") + run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=True) + + # Run pipeline on a list of prompts. 
+ prompts = [ + "starry night over Golden Gate Bridge by van gogh", + "little cute gremlin sitting on a bed, cinematic", + ] + args.num_warmup_runs = 0 + for example_prompt in prompts: + args.prompt = [example_prompt] + prompt, negative_prompt = repeat_prompt(args) + + for height, width in image_sizes: + args.height = height + args.width = width + print(f"\nBatch_size={batch_size}, Height={height}, Width={width}, Prompt={example_prompt}") + run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False) + + base.teardown() refiner.teardown() if __name__ == "__main__": coloredlogs.install(fmt="%(funcName)20s: %(message)s") - run_demo() + + args = parse_arguments(is_xl=True, description="Options for Stable Diffusion XL Demo") + no_prompt = isinstance(args.prompt, list) and len(args.prompt) == 1 and not args.prompt[0] + if no_prompt: + run_dynamic_shape_demo(args) + else: + run_demo(args) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py index e65efd2c53839..d0e4e3adefbc3 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py @@ -78,7 +78,7 @@ def parse_arguments(is_xl: bool, description: str): help="Root Directory to store torch or ONNX models, built engines and output images etc.", ) - parser.add_argument("prompt", nargs="+", help="Text prompt(s) to guide image generation.") + parser.add_argument("prompt", nargs="*", default=[""], help="Text prompt(s) to guide image generation.") parser.add_argument( "--negative-prompt", nargs="*", default=[""], help="Optional negative prompt(s) to guide the image generation." 
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py index 4a2e9eb3443da..d93ca8dba7fa0 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py @@ -82,12 +82,21 @@ def infer_shapes(self): class PipelineInfo: - def __init__(self, version: str, is_inpaint: bool = False, is_refiner: bool = False, use_vae=False): + def __init__( + self, + version: str, + is_inpaint: bool = False, + is_refiner: bool = False, + use_vae=False, + min_image_size=256, + max_image_size=1024, + ): self.version = version self._is_inpaint = is_inpaint self._is_refiner = is_refiner self._use_vae = use_vae - + self._min_image_size = min_image_size + self._max_image_size = max_image_size if is_refiner: assert self.is_xl() @@ -187,6 +196,12 @@ def unet_embedding_dim(self): else: raise ValueError(f"Invalid version {self.version}") + def min_image_size(self): + return self._min_image_size + + def max_image_size(self): + return self._max_image_size + class BaseModel: def __init__( @@ -209,8 +224,8 @@ def __init__( self.min_batch = 1 self.max_batch = max_batch_size - self.min_image_shape = 256 # min image resolution: 256x256 - self.max_image_shape = 1024 # max image resolution: 1024x1024 + self.min_image_shape = pipeline_info.min_image_size() + self.max_image_shape = pipeline_info.max_image_size() self.min_latent_shape = self.min_image_shape // 8 self.max_latent_shape = self.max_image_shape // 8 diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py index e28db2b77105a..e34fab1218b21 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py @@ -163,12 +163,9 @@ def is_backend_tensorrt(self): return self.engine_type == EngineType.TRT def set_denoising_steps(self, denoising_steps: int): - if self.denoising_steps != denoising_steps: - assert self.denoising_steps is None # TODO(tianleiwu): support changing steps in different runs - # Pre-compute latent input scales and linear multistep coefficients - self.scheduler.set_timesteps(denoising_steps) - self.scheduler.configure() - self.denoising_steps = denoising_steps + self.scheduler.set_timesteps(denoising_steps) + self.scheduler.configure() + self.denoising_steps = denoising_steps def load_resources(self, image_height, image_width, batch_size): # If engine is built with static input shape, call this only once after engine build. From 2151c79bf190cdd71794d224f8efea4f8b9cd207 Mon Sep 17 00:00:00 2001 From: pengwa Date: Wed, 8 Nov 2023 17:42:50 +0800 Subject: [PATCH 02/13] Tune ORTModule logging experience a bit (#18298) ### Tune logging experience a bit After last time we update the ORTModule log experience, we found few issues: 1. `INFO` level output too many things, including PyTorch exporter verbose logs (tracing graphs) on every ranks. On this level, we only want to - Output a little bit more information to Users than `WARNING` level, for example the memory recomputation recommendations or other not-fully-ready features. - Output a little bit more information for a quick diagnostic, collected on rank-0 only. 2. 
The ONNX Runtime logging filter applied during graph build and session initialization sometimes hides issues (for example, a segmentation fault), leaving no useful information at `WARNING`/`INFO` for users to report back to us. This is not good!
3. Some of our devs like using `pdb` to debug Python code, but adding `import pdb; pdb.set_trace()` in a model's code might hang when they use `INFO` or `WARNING`, because the export happens and all output gets redirected due to log filtering. The only workaround is to switch to VERBOSE, which outputs far too many logs.

The corresponding changes proposed here are:
1. For `INFO` logging,
   - We only log on rank-0.
   - We restrict the ORT backend logging level to WARNING in this case, because the ORT backend code outputs way too many logs that should be under verbose, and we cannot guarantee they get cleaned up immediately once they are added.
   - We output the PyTorch exporter verbose log (including the tracing graph), which is useful for a quick diagnostic when an issue happens.
2. Remove all logging filtering on the ORT backend, so the segmentation fault details will not be hidden if it happens again.
3. Introduce a `DEVINFO` logging level, which
   - logs on all ranks,
   - sets the ORT backend logging level to INFO, and
   - turns all PyTorch exporter logging filtering OFF (to unblock pdb debugging).
4. Currently, using Memory Optimizer requires DEVINFO (which will output ORT backend INFO logs), so the memory optimizer document is updated to reflect this. https://github.com/microsoft/onnxruntime/pull/17481 will move the requirement back to INFO for showing memory optimization info.

You can check https://github.com/microsoft/onnxruntime/blob/pengwa/devinfo_level/docs/ORTModule_Training_Guidelines.md#log-level-explanations for a better view of the different log levels.

This PR also extracts some changes from a bigger one, https://github.com/microsoft/onnxruntime/pull/17481, to reduce its complexity for review.

### Motivation and Context

---------

Co-authored-by: mindest <30493312+mindest@users.noreply.github.com>
---
 docs/Memory_Optimizer.md                      |  8 +-
 docs/ORTModule_Training_Guidelines.md         | 84 +++++++++++++++++++
 .../ortmodule/_graph_execution_manager.py     |  1 -
 .../training/ortmodule/_inference_manager.py  |  4 +-
 .../python/training/ortmodule/_logger.py      | 19 +++--
 .../training/ortmodule/_training_manager.py   |  4 +-
 .../python/training/ortmodule/options.py      |  5 --
 7 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/docs/Memory_Optimizer.md b/docs/Memory_Optimizer.md
index 3ef3a575f20a1..e9ceae00a684d 100644
--- a/docs/Memory_Optimizer.md
+++ b/docs/Memory_Optimizer.md
@@ -20,10 +20,10 @@ Not all models and recipes need this optimizer technique. Imagine if your traini
 
 ## Quick trial
 
 1. Make sure ONNX Runtime training wheel is installed and correctly configured.
-2. Integrate models using `ORTModule`, be noted log_level should be equal or lower than INFO.
-   > ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.INFO))
-3. Run the training as usual and redirect all outputs into log file; then stop it after training few steps.
-4. Check the logging file, search "Summary", you could possibly find something like this:
+2. Integrate models using `ORTModule`, be noted log_level should be equal to or lower than DEVINFO.
+   > ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.DEVINFO))
+3. Run the training as usual and redirect all outputs into the log file; then stop it after training a few steps.
+4.
Check the logging file, and search "Summary", you could find something like this:
 ```
 MemoryOptimizer Summary:
 User config:
diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md
index 5350988b20964..12733c3551704 100644
--- a/docs/ORTModule_Training_Guidelines.md
+++ b/docs/ORTModule_Training_Guidelines.md
@@ -49,6 +49,90 @@ More options for **developers**.
   ```
   Check [DebugOptions implementation](../orttraining/orttraining/python/training/ortmodule/options.py) for more details.
 
+#### Log Level Explanations
+
+<table>
+<tr>
+<th>Log Level</th>
+<th>Description</th>
+</tr>
+<tr>
+<td>
+
+`FATAL` | `ERROR` | `WARNING` (For Users)
+
+`WARNING` is the default and recommended level for
+<br>users.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `FATAL` | `ERROR` | `WARNING`.
+- ORTModule log level - `FATAL` | `ERROR` | `WARNING`.
+- Rank-0 log filtering is `ON` (e.g. logging on rank-0-only).
+- PyTorch exporter export logs filtering is `ON`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `ON`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`INFO` (For Users | ORT Developers)
+
+`INFO` is used for collecting experimental
+<br>feature stats, or a little bit more error messages.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `WARNING`.
+- ORTModule log level - `INFO`.
+- Rank-0 log filtering is `ON` (e.g. logging on rank-0-only).
+- PyTorch exporter export logs filtering is `ON`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`DEVINFO` (For ORT Developers)
+
+`DEVINFO` is the recommended level for
+<br>debugging purposes.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `INFO`.
+- ORTModule log level - `INFO`.
+- Rank-0 log filtering is `OFF` (e.g. logging on all ranks).
+- PyTorch exporter export logs filtering is `OFF`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`VERBOSE` (For ORT Developers)
+
+`VERBOSE` is the last resort for debugging
+<br>hard problems.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `VERBOSE`.
+- ORTModule log level - `VERBOSE`.
+- Rank-0 log filtering is `OFF` (e.g. logging on all ranks).
+- PyTorch exporter export logs filtering is `OFF`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+</table>
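For illustration, a minimal sketch of wrapping a model at the new `DEVINFO` level; it assumes the usual `onnxruntime.training.ortmodule` import path and an installed onnxruntime-training wheel:

```python
# Minimal sketch: wrap a PyTorch model with ORTModule at the DEVINFO log level.
# The import path below is assumed (the common public one for onnxruntime-training).
import torch
from onnxruntime.training.ortmodule import DebugOptions, LogLevel, ORTModule

pt_model = torch.nn.Linear(8, 8)

# DEVINFO logs on all ranks and keeps the ORT backend log level at INFO (see the table above).
ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.DEVINFO))
```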
+ + ### 2.1 Environment Variables `ORTModule` provides environment variables targeting different use cases. diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 04820218b7c49..5eb1d9f382380 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -473,7 +473,6 @@ def _get_graph_transformer_config(self) -> C.TrainingGraphTransformerConfigurati return graph_transformer_config @_logger.TrackTime(_logger.ORTModuleInitPhase.GRAPH_BUILDER_INIT) - @_logger.SuppressLogs(_logger.ORTModuleInitPhase.GRAPH_BUILDER_INIT) def _initialize_graph_builder(self): """Creates a new OrtModuleGraphBuilder, initializes it and saves it to self._graph_builder""" diff --git a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py index 8d8be81c549d1..6690af9b71bf1 100644 --- a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py @@ -16,7 +16,7 @@ from ._fallback import ORTModuleFallbackException, _FallbackManager, _FallbackPolicy from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo from ._io import unflatten_user_output -from ._logger import ORTModuleInitPhase, SuppressLogs, TrackTime +from ._logger import ORTModuleInitPhase, TrackTime from ._utils import save_tuning_results, set_tuning_results from .options import DebugOptions, _SkipCheck @@ -207,7 +207,6 @@ def forward(self, *inputs, **kwargs): return self._fallback_manager.fallback(self._debug_options.logging.log_level, *inputs, **kwargs) @TrackTime(ORTModuleInitPhase.BUILD_GRAPH) - @SuppressLogs(ORTModuleInitPhase.BUILD_GRAPH) def _build_graph(self, graph_transformer_config): """Build an inference graph using the module_graph_builder""" @@ -221,7 +220,6 @@ def _build_graph(self, graph_transformer_config): ) @TrackTime(ORTModuleInitPhase.CREATE_SESSION) - @SuppressLogs(ORTModuleInitPhase.CREATE_SESSION) def _create_execution_agent(self): """Creates an InferenceAgent that can run forward graph on an inference model""" diff --git a/orttraining/orttraining/python/training/ortmodule/_logger.py b/orttraining/orttraining/python/training/ortmodule/_logger.py index e075ced8eaac2..0728ebdf19af8 100644 --- a/orttraining/orttraining/python/training/ortmodule/_logger.py +++ b/orttraining/orttraining/python/training/ortmodule/_logger.py @@ -21,15 +21,18 @@ class LogLevel(IntEnum): VERBOSE = 0 - INFO = 1 - WARNING = 2 - ERROR = 3 - FATAL = 4 + DEVINFO = 1 # For ORT developers. + INFO = 2 # For ORT users. + WARNING = 3 + ERROR = 4 + FATAL = 5 ORTMODULE_LOG_LEVEL_MAP: Dict[LogLevel, List[int]] = { LogLevel.VERBOSE: [Severity.VERBOSE, logging.DEBUG], - LogLevel.INFO: [Severity.INFO, logging.INFO], + LogLevel.DEVINFO: [Severity.INFO, logging.INFO], + # ONNX Runtime has too many INFO logs, so we map it to WARNING for a better user experience. + LogLevel.INFO: [Severity.WARNING, logging.INFO], LogLevel.WARNING: [Severity.WARNING, logging.WARNING], LogLevel.ERROR: [Severity.ERROR, logging.ERROR], LogLevel.FATAL: [Severity.FATAL, logging.FATAL], @@ -48,13 +51,13 @@ def configure_ortmodule_logger(log_level: LogLevel) -> logging.Logger: """Configure the logger for ortmodule according to following rules. 1. 
If multiple processes are used, the rank will be appended to the logger name. - 2. If the log level is greater than info, the logger will be + 2. If the log level is equal to or greater than INFO, the logger will be disabled for non-zero ranks. """ rank_info = f".rank-{get_rank()}" if get_world_size() > 1 else "" logger = logging.getLogger(f"orttraining{rank_info}") - # Disable the logger for non-zero ranks when level > info - logger.disabled = log_level > LogLevel.INFO and get_rank() != 0 + # Disable the logger for non-zero ranks when level >= INFO + logger.disabled = log_level >= LogLevel.INFO and get_rank() != 0 logger.setLevel(ortmodule_loglevel_to_python_loglevel(log_level)) return logger diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index e0f11e5aa407e..bafb64235546b 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -18,7 +18,7 @@ from ._gradient_accumulation_manager import GradientAccumulationManager from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo from ._io import _FlattenedModule, _InputInfo, unflatten_user_output -from ._logger import ORTModuleInitPhase, SuppressLogs, TrackTime +from ._logger import ORTModuleInitPhase, TrackTime from ._runtime_inspector import Phase from ._utils import save_tuning_results, set_tuning_results from .graph_optimizer_registry import GraphOptimizerRegistry @@ -358,7 +358,6 @@ def forward(self, *inputs, **kwargs): return self._fallback_manager.fallback(self._debug_options.logging.log_level, *inputs, **kwargs) @TrackTime(ORTModuleInitPhase.BUILD_GRAPH) - @SuppressLogs(ORTModuleInitPhase.BUILD_GRAPH) def _build_graph(self, graph_transformer_config): """Build an optimized gradient graph using the module_graph_builder""" @@ -401,7 +400,6 @@ def _build_graph(self, graph_transformer_config): self._gradient_map.append(-1) @TrackTime(ORTModuleInitPhase.CREATE_SESSION) - @SuppressLogs(ORTModuleInitPhase.CREATE_SESSION) def _create_execution_agent(self): """Creates a TrainingAgent that can run the forward and backward graph on the training model""" diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py index ff0cde37195cb..cddd9cd440b28 100644 --- a/orttraining/orttraining/python/training/ortmodule/options.py +++ b/orttraining/orttraining/python/training/ortmodule/options.py @@ -167,11 +167,6 @@ def torch_exporter_filter(self): @property def onnxruntime_log_filter(self): """Accessor for the filter onnxruntime logs configuration.""" - if self.log_level >= LogLevel.INFO: - return [ - "CleanUnusedInitializersAndNodeArgs] Removing initializer", - "Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED", - ] return None From 34f77eaa243ed16bbcea8fa585c9f89539488b27 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Wed, 8 Nov 2023 08:40:02 -0800 Subject: [PATCH 03/13] bfloat16 support for quickgelugrad (#18336) ### Description Registers BFloat16 datatype as valid input type for CUDA QuickGeluGrad Kernel. ### Motivation and Context Enabling `meta-llama/Llama-2-70b` to be finetuned with ONNX Runtime training. 
--------- Co-authored-by: Prathik Rao --- .../training_ops/cuda/activation/activations_grad.cc | 6 +++++- .../training_ops/cuda/activation/activations_grad_impl.cu | 7 ++++--- .../orttraining/training_ops/cuda/cuda_training_kernels.cc | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc b/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc index 7fde69d758ca9..98e3b878c9e0e 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc @@ -43,11 +43,15 @@ namespace cuda { ACTIVATION_GRAD_OP_TYPED(name, ver, domain, float) \ ACTIVATION_GRAD_OP_TYPED(name, ver, domain, double) +#define ACTIVATION_GRAD_OP_HFDX(name, ver, domain) \ + ACTIVATION_GRAD_OP_HFD(name, ver, domain) \ + ACTIVATION_GRAD_OP_TYPED(name, ver, domain, BFloat16) + ACTIVATION_GRAD_OP_HFD(GeluGrad, 1, kMSDomain); ACTIVATION_GRAD_OP_HFD(FastGeluGrad, 1, kMSDomain); ACTIVATION_GRAD_OP_HFD(ReluGrad, 1, kMSDomain); ACTIVATION_GRAD_OP_HFD(SigmoidGrad, 1, kMSDomain); -ACTIVATION_GRAD_OP_HFD(QuickGeluGrad, 1, kMSDomain); +ACTIVATION_GRAD_OP_HFDX(QuickGeluGrad, 1, kMSDomain); ACTIVATION_GRAD_OP_HFD(TanhGrad, 1, kMSDomain); ACTIVATION_GRAD_OP_HFD(LeakyReluGrad, 1, kMSDomain); diff --git a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu index 164aba866722e..dd6a44b9e3b56 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu @@ -83,14 +83,15 @@ struct OP_LeakyReluGrad : public CtxLeakyReluGrad { #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL(name, T) \ template void Impl_##name(cudaStream_t stream, const T* lhs_data, const T* rhs_data, T* output_data, const Ctx##name* func_ctx, size_t count); -#define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(x) \ +#define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFDX(x) \ SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, half) \ SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, float) \ - SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, double) + SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, double) \ + SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, BFloat16) #define ACTIVATION_GRAD_OP_NAME(name) \ BINARY_ELEMENTWISE_IMPL(name); \ - SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(name) + SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFDX(name) ACTIVATION_GRAD_OPS() #undef ACTIVATION_GRAD_OP_NAME diff --git a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc index ae4f48b6b49a2..eeaa51c4dc1d8 100644 --- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc @@ -121,6 +121,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, QuickGeluGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, QuickGeluGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, QuickGeluGrad); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, QuickGeluGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, 
TanhGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, TanhGrad); @@ -378,6 +379,7 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, From c8def0cc5193783492c0ed153f0d71527a80857e Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Wed, 8 Nov 2023 09:36:28 -0800 Subject: [PATCH 04/13] Add LLaMA GQA ragged batching (#18337) This PR updates replacing MHA with GQA and updates the LLaMA scripts for the modified GQA op. It is related to the changes in [this PR](https://github.com/microsoft/onnxruntime/pull/18283). ### Motivation and Context This PR allows us to run LLaMA with the GQA op end-to-end using ragged batching (i.e. batched inputs of different lengths). --- .../tools/transformers/convert_generation.py | 119 +++++++++++++----- .../tools/transformers/models/llama/README.md | 97 ++++++++++++-- .../transformers/models/llama/benchmark.py | 22 ++-- .../models/llama/convert_to_onnx.py | 43 ++++--- .../transformers/models/llama/llama_inputs.py | 47 +++---- .../transformers/models/llama/llama_parity.py | 29 +++-- .../transformers/models/llama/llama_torch.py | 6 +- 7 files changed, 257 insertions(+), 106 deletions(-) diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index 7aca5e8526a23..b59af41c49df7 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -1272,39 +1272,96 @@ def find_past_seq_len_usage(subg: GraphProto): return tensor_names_to_rename, nodes_to_remove -def replace_mha_with_gqa(model: OnnxModel, past_seq_len_input: str, kv_num_heads: int = 0, world_size: int = 1): - past_seq_len = past_seq_len_input - if past_seq_len not in model.get_graphs_input_names(): - # Add model input for past sequence length - new_input = onnx.helper.make_tensor_value_info(past_seq_len, onnx.TensorProto.INT64, shape=[1]) - model.model.graph.input.append(new_input) +def replace_mha_with_gqa(model: OnnxModel, attn_mask: str, kv_num_heads: int = 0, world_size: int = 1): + # Insert attention_mask subgraph to calculate shared inputs for all GroupQueryAttention nodes + # + # attention_mask + # / \ + # ReduceSum Shape + # | | + # Sub Gather + # | | + # seqlens_k total_sequence_length + # | | + # Cast to int32 Cast to int32 + + model.add_initializer( + onnx.helper.make_tensor( + name="one", + data_type=TensorProto.INT64, + dims=[1], + vals=[1], + ) + ) + reduce_sum_node = onnx.helper.make_node( + "ReduceSum", + inputs=[attn_mask, "one"], + outputs=[attn_mask + "_row_sums"], + name=model.create_node_name("ReduceSum"), + ) + sub_node = onnx.helper.make_node( + "Sub", + inputs=[attn_mask + "_row_sums", "one"], + outputs=["seqlens_k_int64"], + name=model.create_node_name("Sub"), + ) + seqlen_k_cast_node = onnx.helper.make_node( + "Cast", + inputs=["seqlens_k_int64"], + outputs=["seqlens_k"], + name=model.create_node_name("Cast"), + to=TensorProto.INT32, + ) + shape_node = onnx.helper.make_node( + "Shape", + inputs=[attn_mask], + outputs=[attn_mask + "_shape"], + name=model.create_node_name("Shape"), + ) + gather_node = onnx.helper.make_node( + "Gather", + inputs=[attn_mask + "_shape", "one"], + outputs=["total_seq_len_int64"], + name=model.create_node_name("Gather"), + 
axis=0, + ) + total_seqlen_cast_node = onnx.helper.make_node( + "Cast", + inputs=["total_seq_len_int64"], + outputs=["total_seq_len"], + name=model.create_node_name("Cast"), + to=TensorProto.INT32, + ) + model.model.graph.node.extend( + [reduce_sum_node, sub_node, seqlen_k_cast_node, shape_node, gather_node, total_seqlen_cast_node] + ) # Replace MultiHeadAttention with GroupQueryAttention - for node in model.model.graph.node: - if node.op_type == "MultiHeadAttention": - num_heads_mha = 0 - for att in node.attribute: - if att.name == "num_heads": - num_heads_mha = att.i - gqa_node = onnx.helper.make_node( - "GroupQueryAttention", - inputs=[ - node.input[0], # query - node.input[1], # key - node.input[2], # value - node.input[6], # past_key - node.input[7], # past_value - past_seq_len, # past_sequence_length - ], - outputs=node.output, - name=node.name.replace("MultiHeadAttention", "GroupQueryAttention"), - domain="com.microsoft", - num_heads=num_heads_mha // world_size, - kv_num_heads=num_heads_mha // world_size if kv_num_heads == 0 else kv_num_heads // world_size, - is_past_bsnh=0, - ) - model.model.graph.node.remove(node) - model.model.graph.node.extend([gqa_node]) + mha_nodes = list(filter(lambda node: node.op_type == "MultiHeadAttention", model.model.graph.node)) + for node in mha_nodes: + num_heads_mha = 0 + for att in node.attribute: + if att.name == "num_heads": + num_heads_mha = att.i + gqa_node = onnx.helper.make_node( + "GroupQueryAttention", + inputs=[ + node.input[0], # query + node.input[1], # key + node.input[2], # value + node.input[6], # past_key + node.input[7], # past_value + "seqlens_k", # seqlens_k (for attention_mask) + "total_seq_len", # total_seq_len (for attention_mask) + ], + outputs=node.output, + name=node.name.replace("MultiHeadAttention", "GroupQueryAttention"), + domain="com.microsoft", + num_heads=num_heads_mha // world_size, + kv_num_heads=num_heads_mha // world_size if kv_num_heads == 0 else kv_num_heads // world_size, + ) + model.model.graph.node.remove(node) + model.model.graph.node.extend([gqa_node]) return model diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md index 1bb6940d1cd74..0c6f830ed26b0 100644 --- a/onnxruntime/python/tools/transformers/models/llama/README.md +++ b/onnxruntime/python/tools/transformers/models/llama/README.md @@ -117,7 +117,7 @@ $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output l $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp32-cpu --precision fp32 --execution_provider cpu ``` -Export for FP16 CUDA +Export for FP16 CUDA (with MultiHeadAttention) ``` # From source: $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda @@ -126,6 +126,63 @@ $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output l $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda ``` +Export for FP16 CUDA (with GroupQueryAttention) +``` +# From source: +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --use_gqa + +# From wheel: +$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider 
cuda --use_gqa +``` + +Note: GroupQueryAttention currently runs on Linux for FP16 CUDA and INT4 CUDA models, and it can provide faster inference than MultiHeadAttention, especially for large sequence lengths (e.g. 1024 or larger). For the best performance, you should pre-allocate the KV cache buffers to have size `(batch_size, num_heads, max_sequence_length, head_size)` so that the past KV and present KV caches share the same memory. You also need to bind them with ONNX Runtime's [IO binding](https://onnxruntime.ai/docs/api/python/api_summary.html#iobinding). + +Here is an example of how you can bind directly to `torch.tensor` objects: +``` +# Assumes all inputs and outputs to the model are pre-allocated with the correct shapes in GPU memory + +# Bind inputs +for k, v in inputs.items(): + io_binding.bind_input( + name=k, + device_type="cuda", + device_id=0, + element_type=np.float16, + shape=tuple(v.shape), + buffer_ptr=v.data_ptr() + ) + +# Bind outputs +for output in model.get_outputs(): + name = output.name + if "present" in name: + # Bind KV cache outputs to KV cache inputs + v = inputs[name.replace("present", "past_key_values")] + io_binding.bind_output( + name=name, + device_type="cuda", + device_id=0, + element_type=np.float16, + shape=tuple(v.shape), + buffer_ptr=v.data_ptr() + ) + else: + # Bind other outputs as actual outputs + v = outputs[name] + io_binding.bind_output( + name=name, + device_type="cuda", + device_id=0, + element_type=np.float16, + shape=tuple(v.shape), + buffer_ptr=v.data_ptr() + ) + +io_binding.synchronize_inputs() +sess.run_with_iobinding(io_binding) +io_binding.synchronize_outputs() +``` + Export for INT8 CPU (SmoothQuant) ``` # From source: @@ -149,12 +206,14 @@ $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama Export for INT4 CUDA ``` # From source: -$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-gpu --precision int4 --quantization_method blockwise --execution_provider cuda +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-gpu --precision int4 --quantization_method blockwise --execution_provider cuda --use_gqa # From wheel: -$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-gpu --precision int4 --quantization_method blockwise --execution_provider cuda +$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-gpu --precision int4 --quantization_method blockwise --execution_provider cuda --use_gqa ``` +Note: See the FP16 CUDA notes about GroupQueryAttention. The `--use_gqa` flag is optional. + Export for INT4 CPU ``` # From source: @@ -168,13 +227,13 @@ Export LLaMA-2 70B sharded model into 4 partitions ``` # From source: # 1. Install necessary packages from requirements-70b-model.txt +$ pip install -r requirements-70b-model.txt # 2. Build ONNX Runtime from source with NCCL enabled. 
Here is a sample command: -$ ./build.sh --config RelWithDebInfo --use_cuda --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/local/cuda-12.2 --build_wheel --cuda_version=12.2 --parallel --skip_tests --enable_nccl --nccl_home /usr/local/cuda-12.2 --use_mpi --mpi_home=/usr/lib/x86_64-linux-gnu/ +$ ./build.sh --config Release --use_cuda --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/local/cuda-12.2 --build_wheel --cuda_version=12.2 --parallel --skip_tests --enable_nccl --nccl_home /usr/local/cuda-12.2 --use_mpi --mpi_home=/usr/lib/x86_64-linux-gnu/ # 3. Shard and export the LLaMA-2 70B model. With FP16, you will need at least 140GB of GPU memory to load the model. Therefore, you will need at least 4 40GB A100 GPUs or 2 80GB A100 GPUs to shard the PyTorch model and export each shard to ONNX. Here is an example command: -$ CUDA_VISIBLE_DEVICES=0,1,2,3 bash convert_70b_model.sh 4 -m meta-llama/Llama-2-70b-hf --output llama2-70b-dis --precision fp16 --execution_provider cuda - +$ CUDA_VISIBLE_DEVICES=0,1,2,3 bash convert_70b_model.sh 4 -m meta-llama/Llama-2-70b-hf --output llama2-70b-distributed --precision fp16 --execution_provider cuda --use_gqa ``` ## Benchmark LLaMA-2 @@ -220,7 +279,20 @@ python3 -m models.llama.benchmark \ --auth ``` -4. ONNX Runtime, FP32, Microsoft custom export +4. Optimum + ONNX Runtime, FP16, export via Optimum or convert_to_onnx +``` +python3 -m models.llama.benchmark \ + --benchmark-type hf-ort \ + --hf-ort-dir-path ./Llama-2-7b-hf-onnx/ \ + --model-name meta-llama/Llama-2-7b-hf \ + --precision fp16 \ + --batch-sizes "1 2" \ + --sequence-lengths "8 16" \ + --device cuda \ + --auth +``` + +5. ONNX Runtime, FP32, Microsoft custom export ``` python3 -m models.llama.benchmark \ --benchmark-type ort-msft \ @@ -232,7 +304,7 @@ python3 -m models.llama.benchmark \ --device cpu ``` -5. ONNX Runtime, FP16, Microsoft custom export +6. ONNX Runtime, FP16, Microsoft custom export ``` python3 -m models.llama.benchmark \ --benchmark-type ort-msft \ @@ -244,7 +316,7 @@ python3 -m models.llama.benchmark \ --device cuda ``` -6. ONNX Runtime, FP32, convert_to_onnx, use 2nd GPU +7. ONNX Runtime, FP32, convert_to_onnx, use 2nd GPU ``` CUDA_VISIBLE_DEVICES=1 python3 -m models.llama.benchmark \ --benchmark-type ort-convert-to-onnx \ @@ -256,7 +328,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m models.llama.benchmark \ --device cpu ``` -7. ONNX Runtime, FP16, convert_to_onnx, use 5th GPU +8. 
ONNX Runtime, FP16, convert_to_onnx, use 5th GPU ``` CUDA_VISIBLE_DEVICES=4 python3 -m models.llama.benchmark \ --benchmark-type ort-convert-to-onnx \ @@ -283,5 +355,8 @@ python3 -m models.llama.benchmark_all \ --precision fp16 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ - --device cuda + --device cuda \ + --warmup-runs 5 \ + --num-runs 1000 \ + --timeout 60 # number of minutes before moving to the next benchmark ``` diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark.py b/onnxruntime/python/tools/transformers/models/llama/benchmark.py index be678931de5d1..021b0dd03a9db 100644 --- a/onnxruntime/python/tools/transformers/models/llama/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/llama/benchmark.py @@ -11,6 +11,7 @@ import onnx import psutil import torch +from benchmark_helper import measure_memory, setup_logger from dist_settings import get_rank, get_size from llama_inputs import ( add_io_bindings, @@ -22,10 +23,9 @@ from optimum.onnxruntime import ORTModelForCausalLM from torch.profiler import ProfilerActivity, profile, record_function from tqdm import trange -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer import onnxruntime as ort -from onnxruntime.transformers.benchmark_helper import measure_memory, setup_logger logger = logging.getLogger(__name__) @@ -107,6 +107,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): past_seq_len=0, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, engine="pt", return_dict=True, ) @@ -118,6 +119,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): past_seq_len=args.sequence_length, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, engine="pt", return_dict=True, ) @@ -132,6 +134,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): past_seq_len=0, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, engine="ort", return_dict=True, world_size=args.world_size, @@ -144,6 +147,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): past_seq_len=args.sequence_length, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, engine="ort", return_dict=True, world_size=args.world_size, @@ -160,6 +164,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): seq_len=args.sequence_length, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, split_kv=split_kv, ) iter_inputs = get_msft_sample_inputs( @@ -169,6 +174,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): seq_len=1, max_seq_len=max_seq_len, use_fp16=args.use_fp16, + use_gqa=args.use_gqa, split_kv=split_kv, ) @@ -192,7 +198,7 @@ def get_model(args: argparse.Namespace): if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}: source = args.hf_pt_dir_path if args.hf_pt_dir_path else args.model_name start_time = time.time() - model = LlamaForCausalLM.from_pretrained( + model = AutoModelForCausalLM.from_pretrained( source, torch_dtype=torch.float16 if args.use_fp16 else torch.float32, use_auth_token=args.auth, @@ -456,7 +462,7 @@ def prepare_ort_inputs(inputs, kv_cache_ortvalues): # Add IO bindings for non-CPU execution providers if args.device != "cpu": io_binding, kv_cache_ortvalues = add_io_bindings( - model, inputs, args.device, int(args.rank), kv_cache_ortvalues + model, inputs, args.device, int(args.rank), args.use_gqa, kv_cache_ortvalues ) 
setattr(args, "io_binding", io_binding) # noqa: B010 return io_binding, kv_cache_ortvalues @@ -650,8 +656,8 @@ def main(): args.rank = rank args.world_size = world_size - tokenizer = LlamaTokenizer.from_pretrained(args.model_name) - config = LlamaConfig.from_pretrained(args.model_name) + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + config = AutoConfig.from_pretrained(args.model_name) target_device = f"cuda:{args.rank}" if args.device != "cpu" else args.device use_fp16 = args.precision == "fp16" @@ -670,9 +676,9 @@ def main(): gqa_nodes = list(filter(lambda node: node.op_type == "GroupQueryAttention", onnx_model.graph.node)) use_buffer_share = use_fp16 and len(gqa_nodes) > 0 and args.device != "cpu" - setattr(args, "past_present_share_buffer", use_buffer_share) # noqa: B010 + setattr(args, "use_gqa", use_buffer_share) # noqa: B010 else: - setattr(args, "past_present_share_buffer", False) # noqa: B010 + setattr(args, "use_gqa", False) # noqa: B010 # Measure prompt cost (init_inputs) and generated token cost (iter_inputs) for batch_size, sequence_length in itertools.product(args.batch_sizes, args.sequence_lengths): diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index b0e0b41e75d3d..c9c7f4d39d423 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -7,6 +7,8 @@ import onnx import torch +from benchmark_helper import Precision, prepare_environment, setup_logger +from convert_generation import replace_mha_with_gqa from dist_settings import barrier, get_rank, get_size, init_dist from llama_inputs import get_merged_sample_with_past_kv_inputs, get_sample_inputs, get_sample_with_past_kv_inputs from llama_parity import main as parity_check @@ -14,12 +16,10 @@ from onnx_model import OnnxModel from optimizer import optimize_model from packaging import version -from transformers import LlamaConfig, LlamaForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM from onnxruntime import quantization as ort_quantization from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer -from onnxruntime.transformers.benchmark_helper import Precision, prepare_environment, setup_logger -from onnxruntime.transformers.convert_generation import replace_mha_with_gqa logger = logging.getLogger("") init_dist() @@ -133,7 +133,7 @@ def save_onnx_model(onnx_model: onnx.ModelProto, output_path: str, data_path: st # temp_dir.cleanup() # def run_dynamo_export( - args: argparse.Namespace, l_config: LlamaConfig, llama: LlamaForCausalLM, rank: int = 0, world_size: int = 1 + args: argparse.Namespace, l_config: AutoConfig, llama: AutoModelForCausalLM, rank: int = 0, world_size: int = 1 ): from torch._dynamo import config @@ -194,7 +194,7 @@ def _prepare_dir(dir_path): def run_torchscript_separate_export( - args: argparse.Namespace, l_config: LlamaConfig, llama: LlamaForCausalLM, rank: int = 0, world_size: int = 1 + args: argparse.Namespace, l_config: AutoConfig, llama: AutoModelForCausalLM, rank: int = 0, world_size: int = 1 ): # Dummy values for export batch_size, sequence_length = 2, 8 @@ -313,7 +313,7 @@ def run_torchscript_separate_export( def run_torchscript_merged_export( - args: argparse.Namespace, l_config: LlamaConfig, llama: LlamaForCausalLM, rank: int = 0, world_size: int = 1 + args: argparse.Namespace, l_config: AutoConfig, llama: AutoModelForCausalLM, rank: 
int = 0, world_size: int = 1 ): # Dummy values for export batch_size, sequence_length, past_sequence_length = 2, 8, 0 @@ -391,7 +391,7 @@ def run_torchscript_merged_export( # Optimize the model as FP32 -def optimize_export(config: LlamaConfig, input_path: str, output_path: str): +def optimize_export(config: AutoConfig, input_path: str, output_path: str): from fusion_options import FusionOptions optimization_options = FusionOptions("gpt2") @@ -411,7 +411,7 @@ def optimize_export(config: LlamaConfig, input_path: str, output_path: str): def convert_to_float16( - args: argparse.Namespace, config: LlamaConfig, old_paths: List[str], rank: int = 0, world_size: int = 1 + args: argparse.Namespace, config: AutoConfig, old_paths: List[str], rank: int = 0, world_size: int = 1 ): decoder_model_fp16_path = os.path.join(args.output, f"rank_{rank}_{args.model_name}_decoder_model_fp16.onnx") decoder_with_past_model_fp16_path = os.path.join( @@ -427,7 +427,8 @@ def convert_to_float16( if os.path.exists(fp32_path): model = OnnxModel(onnx.load_model(fp32_path, load_external_data=True)) model.convert_float_to_float16(keep_io_types=False) - model = use_group_query_attention(config, model, world_size) + if args.use_gqa: + model = use_group_query_attention(config, model, world_size) model.save_model_to_file(fp16_path, use_external_data_format=True) del model logger.info(f"The ONNX model at {fp32_path} has been converted to float16 and saved at {fp16_path}!") @@ -437,11 +438,9 @@ def convert_to_float16( return new_paths -def use_group_query_attention(config: LlamaConfig, fp16_model_opt: OnnxModel, world_size: int = 1): - # Replace MultiHeadAttention with GroupQueryAttention and remove attention mask nodes - fp16_model_opt = replace_mha_with_gqa( - fp16_model_opt, "past_sequence_length", config.num_key_value_heads, world_size - ) +def use_group_query_attention(config: AutoConfig, fp16_model_opt: OnnxModel, world_size: int = 1): + # Replace MultiHeadAttention with GroupQueryAttention + fp16_model_opt = replace_mha_with_gqa(fp16_model_opt, "attention_mask", config.num_key_value_heads, world_size) fp16_model_opt.prune_graph() fp16_model_opt.update_graph(allow_remove_graph_inputs=True) return fp16_model_opt @@ -520,8 +519,8 @@ def smooth_quant( logger.info(f"The {args.model_name} ONNX model has been successfully quantized to int8!") - logger.info(f"Removing {args.nc_workspace}") - os.system(f"rm -R {args.nc_workspace}") + logger.warning(f"Removing {args.nc_workspace}") + shutil.rmtree(args.nc_workspace) def remove_existing_model(model_path: str): @@ -594,6 +593,14 @@ def get_args(): ) parser.set_defaults(reexport=False) + parser.add_argument( + "--use_gqa", + required=False, + action="store_true", + help="Use GroupQueryAttention instead of MultiHeadAttention", + ) + parser.set_defaults(use_gqa=False) + parser.add_argument( "--no_merged", required=False, @@ -747,7 +754,7 @@ def main(): location = args.original_model_name if use_auth_token else args.input - # use cuda for Llama-2-70b to speedup export, other models use CPU by default + # Use CUDA for LLaMA-2-70B to speed up export and CPU for other models l_config, llama = setup_torch_model( args, location, use_auth_token, device=args.device if args.model_name == "Llama-2-70b-hf" else None ) @@ -944,6 +951,8 @@ def main(): parity_cmd.append("--use_past_kv") if "merged" in filename: parity_cmd.append("--merged") + if args.use_gqa: + parity_cmd.append("--use_gqa") try: logger.debug(f"check parity with cmd: {parity_cmd}") diff --git 
a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py index 6530eead55f03..bae1ae82e8f7e 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py @@ -2,7 +2,7 @@ import numpy as np import torch -from transformers import LlamaConfig +from transformers import AutoConfig from onnxruntime import InferenceSession, OrtValue @@ -24,7 +24,7 @@ def get_position_ids(attention_mask: torch.Tensor, use_past_kv: bool): # attention_mask: (batch_size, sequence_length) # position_ids: (batch_size, sequence_length) def get_sample_inputs( - config: LlamaConfig, + config: AutoConfig, device: torch.device, batch_size: int, seq_len: int, @@ -59,7 +59,7 @@ def get_sample_inputs( # past_key: (batch_size, num_heads, past_sequence_length, head_size) # past_value: (batch_size, num_heads, past_sequence_length, head_size) def get_sample_with_past_kv_inputs( - config: LlamaConfig, + config: AutoConfig, device: torch.device, batch_size: int, past_seq_len: int, @@ -115,13 +115,14 @@ def get_sample_with_past_kv_inputs( # For models with GQA, kv_sequence_length = max_sequence_length # For models without GQA, kv_sequence_length = past_sequence_length def get_merged_sample_with_past_kv_inputs( - config: LlamaConfig, + config: AutoConfig, device: torch.device, batch_size: int, seq_len: int, past_seq_len: int, max_seq_len: int, use_fp16: bool = False, + use_gqa: bool = False, engine: str = "pt", return_dict: bool = False, world_size: int = 1, @@ -156,9 +157,7 @@ def get_merged_sample_with_past_kv_inputs( assert isinstance(past_kv, dict) inputs.update(past_kv) - if use_fp16: # If model has GQA - del inputs["attention_mask"] - inputs["past_sequence_length"] = np.array([past_seq_len], dtype=np.int64) + if use_gqa: inputs = enable_past_present_share_buffer(inputs, past_seq_len, max_seq_len) else: @@ -170,12 +169,13 @@ def get_merged_sample_with_past_kv_inputs( # Inputs for Microsoft export from https://github.com/microsoft/Llama-2-Onnx def get_msft_sample_inputs( - config: LlamaConfig, + config: AutoConfig, batch_size: int, past_seq_len: int, seq_len: int, max_seq_len: int, use_fp16: bool, + use_gqa: bool, split_kv: bool, ): np_dtype = np.float16 if use_fp16 else np.float32 @@ -213,8 +213,7 @@ def get_msft_sample_inputs( } ) - if use_fp16: # If model has GQA - del ort_inputs["attn_mask"] + if use_gqa: ort_inputs = enable_past_present_share_buffer(ort_inputs, past_seq_len, max_seq_len) return ort_inputs @@ -222,7 +221,7 @@ def get_msft_sample_inputs( # Create past_key_values # Each is of shape (batch_size, num_heads, past_sequence_length, head_size) -def get_past_kv_inputs(config: LlamaConfig, batch_size: int, past_seq_len: int, use_fp16: bool, world_size: int = 1): +def get_past_kv_inputs(config: AutoConfig, batch_size: int, past_seq_len: int, use_fp16: bool, world_size: int = 1): num_heads, head_size = config.num_key_value_heads // world_size, config.hidden_size // config.num_attention_heads torch_dtype = torch.float16 if use_fp16 else torch.float32 past_kv = [ @@ -247,8 +246,7 @@ def flatten_past_kv_inputs(past_key_values: List[Tuple[torch.Tensor, torch.Tenso # Format PyTorch inputs to ONNX Runtime inputs def convert_inputs_for_ort( pt_inputs: dict, - use_fp16: bool, - use_buffer_share: bool = False, + use_gqa: bool = False, past_seq_len: int = 0, max_seq_len: int = 2048, device: str = "", @@ -260,17 +258,11 @@ def convert_inputs_for_ort( ort_inputs[k] 
= v elif k == "past_key_values": ort_inputs.update(flatten_past_kv_inputs(v)) - elif k == "attention_mask" and use_fp16 and use_buffer_share: - # Skip because FP16 model has GroupQueryAttention, uses buffer sharing, - # and GQA supports a causal mask by default - - # Instead, add the past sequence length input for GQA - ort_inputs["past_sequence_length"] = np.array([past_seq_len], dtype=np.int64) else: ort_inputs[k] = v.detach().cpu().numpy() - # Reshape kv caches if using past-present-share-buffer - if use_buffer_share and device != "" and device != "cpu" and device_id > -1: + # Reshape KV caches if using past-present-share-buffer + if use_gqa and device != "" and device != "cpu" and device_id > -1: ort_inputs = enable_past_present_share_buffer(ort_inputs, past_seq_len, max_seq_len) return ort_inputs @@ -289,17 +281,14 @@ def enable_past_present_share_buffer(ort_inputs: dict, past_seq_len: int, max_se # Add IO bindings for execution providers -def add_io_bindings(model: InferenceSession, ort_inputs: dict, device: str, device_id: int, kv_cache_ortvalues: dict): - use_fp16 = False +def add_io_bindings( + model: InferenceSession, ort_inputs: dict, device: str, device_id: int, use_gqa: bool, kv_cache_ortvalues: dict +): io_binding = model.io_binding() for k, v in ort_inputs.items(): - # Detect if model is in FP16 - if v.dtype == np.float16: - use_fp16 = True - # Bind OrtValue inputs to device - if use_fp16 and ("cache" in k or "past_key_values" in k): + if use_gqa and ("cache" in k or "past_key_values" in k): if k not in kv_cache_ortvalues: v_device = OrtValue.ortvalue_from_numpy(v, device_type=device, device_id=device_id) io_binding.bind_ortvalue_input(k, v_device) @@ -313,7 +302,7 @@ def add_io_bindings(model: InferenceSession, ort_inputs: dict, device: str, devi for output in model.get_outputs(): name = output.name - if use_fp16 and ("out" in name or "present" in name): + if use_gqa and ("out" in name or "present" in name): # Bind present KV cache outputs to past KV cache inputs in order to buffer share input_name = name.replace("out", "cache").replace("present", "past_key_values") io_binding.bind_ortvalue_output(name, kv_cache_ortvalues[input_name]) diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py index 42581caf3bb9e..418a65325c8f0 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py @@ -6,6 +6,7 @@ import numpy as np import torch +from benchmark_helper import setup_logger from dist_settings import get_rank, get_size from llama_inputs import ( add_io_bindings, @@ -15,10 +16,9 @@ get_sample_with_past_kv_inputs, ) from llama_torch import setup_torch_model -from transformers import LlamaConfig, LlamaForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM import onnxruntime as ort -from onnxruntime.transformers.benchmark_helper import setup_logger logger = logging.getLogger("") @@ -30,7 +30,7 @@ def get_sequence_lengths(args: argparse.Namespace): return past_sequence_length, curr_sequence_length, max_sequence_length -def get_inputs(args: argparse.Namespace, config: LlamaConfig): +def get_inputs(args: argparse.Namespace, config: AutoConfig): # Dummy values for parity world_size = get_size() batch_size = 2 @@ -45,6 +45,7 @@ def get_inputs(args: argparse.Namespace, config: LlamaConfig): past_seq_len=past_sequence_length, max_seq_len=max_sequence_length, use_fp16=args.use_fp16, + 
use_gqa=args.use_gqa, return_dict=True, world_size=world_size, ) @@ -64,7 +65,9 @@ def get_inputs(args: argparse.Namespace, config: LlamaConfig): return inputs -def verify_parity(args: argparse.Namespace, config: LlamaConfig, pt_model: LlamaForCausalLM, kv_cache_ortvalues: dict): +def verify_parity( + args: argparse.Namespace, config: AutoConfig, pt_model: AutoModelForCausalLM, kv_cache_ortvalues: dict +): inputs = get_inputs(args, config) # Run inference with PyTorch @@ -82,8 +85,7 @@ def verify_parity(args: argparse.Namespace, config: LlamaConfig, pt_model: Llama past_sequence_length, _, max_sequence_length = get_sequence_lengths(args) inputs = convert_inputs_for_ort( inputs, - use_fp16=args.use_fp16, - use_buffer_share=args.use_fp16, + use_gqa=args.use_gqa, past_seq_len=past_sequence_length, max_seq_len=max_sequence_length, device=args.execution_provider, @@ -102,7 +104,12 @@ def verify_parity(args: argparse.Namespace, config: LlamaConfig, pt_model: Llama # Add IO bindings for non-CPU execution providers if args.execution_provider != "cpu": io_binding, kv_cache_ortvalues = add_io_bindings( - ort_model, inputs, args.execution_provider, int(args.rank), kv_cache_ortvalues + ort_model, + inputs, + args.execution_provider, + int(args.rank), + args.use_gqa, + kv_cache_ortvalues, ) io_binding.synchronize_inputs() @@ -183,6 +190,14 @@ def get_args(argv: List[str]): ) parser.set_defaults(use_past_kv=False) + parser.add_argument( + "-g", + "--use_gqa", + action="store_true", + help="Use if model has GroupQueryAttention", + ) + parser.set_defaults(use_gqa=False) + parser.add_argument( "--merged", action="store_true", diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_torch.py b/onnxruntime/python/tools/transformers/models/llama/llama_torch.py index cf6406dde5be0..94e0397116d1c 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_torch.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_torch.py @@ -3,7 +3,7 @@ import torch from dist_settings import barrier, get_rank, get_size -from transformers import LlamaConfig, LlamaForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM logger = logging.getLogger("") @@ -19,9 +19,9 @@ def setup_torch_model(args, location, use_auth_token, torch_dtype=torch.float32, for i in range(world_size): if i == rank % (world_size): - l_config = LlamaConfig.from_pretrained(location, use_auth_token=use_auth_token, cache_dir=args.cache_dir) + l_config = AutoConfig.from_pretrained(location, use_auth_token=use_auth_token, cache_dir=args.cache_dir) l_config.use_cache = True - llama = LlamaForCausalLM.from_pretrained( + llama = AutoModelForCausalLM.from_pretrained( location, use_auth_token=use_auth_token, config=l_config, From a0eeeafa8050bc354fae279138147f3ea59f83eb Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 8 Nov 2023 10:06:15 -0800 Subject: [PATCH 05/13] [QNN EP] Session option for graph optimization (#18262) ### Description Adds the QNN session option `htp_graph_finalization_optimization_mode` to enable QNN graph optimizations at the expense of longer preparation time. ### Motivation and Context Allow enabling QNN graph optimizations per app/model. 
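For illustration, a minimal sketch of setting this option from the Python API; the `backend_path` value and the model path are assumptions, not prescribed by this change:

```python
# Hypothetical sketch: enable a more aggressive HTP graph finalization mode for the QNN EP.
# Assumes a QNN-enabled ONNX Runtime build and a valid QNN HTP backend library on disk.
import onnxruntime as ort

session = ort.InferenceSession(
    "model.qdq.onnx",  # placeholder model path
    providers=["QNNExecutionProvider"],
    provider_options=[{
        "backend_path": "QnnHtp.dll",                     # assumed backend library name/path
        "htp_graph_finalization_optimization_mode": "3",  # "0" = default; "1"-"3" trade longer prep time for a more optimal graph
    }],
)
```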
--- .../core/session/onnxruntime_c_api.h | 9 ++- .../qnn/builder/onnx_ctx_model_helper.cc | 42 +++++++------- .../qnn/builder/qnn_backend_manager.cc | 35 ++++++++++-- .../qnn/builder/qnn_backend_manager.h | 3 + .../core/providers/qnn/builder/qnn_def.h | 8 +++ .../qnn/builder/qnn_graph_configs_helper.cc | 43 ++++++++++++++ .../qnn/builder/qnn_graph_configs_helper.h | 56 +++++++++++++++++++ .../core/providers/qnn/builder/qnn_model.cc | 5 +- .../core/providers/qnn/builder/qnn_model.h | 3 +- .../providers/qnn/qnn_execution_provider.cc | 44 ++++++++++++++- .../providers/qnn/qnn_execution_provider.h | 6 ++ onnxruntime/test/onnx/main.cc | 14 ++++- .../test/perftest/command_args_parser.cc | 3 + onnxruntime/test/perftest/ort_test_session.cc | 14 ++++- .../test/providers/qnn/qnn_basic_test.cc | 22 +++++++- 15 files changed, 270 insertions(+), 37 deletions(-) create mode 100644 onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc create mode 100644 onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 729a302f3dd0f..1d02b72342722 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -3598,12 +3598,17 @@ struct OrtApi { * "rpc_control_latency": QNN RPC control latency. * "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance", * "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default". - * "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the Onnx skeleton model. + * "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the ONNX skeleton model. * 0 means dump the QNN context binary into separate bin file and set the path to EPContext->ep_cache_context. - * The path is relative path to the Onnx skeleton model file. + * The path is relative path to the ONNX skeleton model file. * "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will * dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and * may alter model/EP partitioning. Use only for debugging. + * "htp_graph_finalization_optimization_mode": Set the optimization mode for graph finalization on the HTP backend. Available options: + * - "0": Default. + * - "1": Faster preparation time, less optimal graph. + * - "2": Longer preparation time, more optimal graph. + * - "3": Longest preparation time, most likely even more optimal graph. See QNN SDK documentation for specific details. 
* * SNPE supported keys: * "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16", diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index 7ccd765e65381..bd9986e661e21 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -86,32 +86,30 @@ Status QnnCacheModelHandler::GetEpContextFromGraph(const onnxruntime::GraphViewe return qnn_backend_manager->LoadCachedQnnContextFromBuffer(const_cast(context_binary.c_str()), static_cast(context_binary.length()), qnn_model); - } else { - std::string external_qnn_context_binary_file_name = node_helper.Get(EP_CACHE_CONTEXT, ""); + } - std::string context_binary_path(std::filesystem::path(ctx_onnx_model_path).parent_path().string() + - "/" + external_qnn_context_binary_file_name); - size_t buffer_size{0}; - std::ifstream cache_file(context_binary_path.c_str(), std::ifstream::binary); - ORT_RETURN_IF(!cache_file || !cache_file.good(), "Failed to open cache file."); + std::string external_qnn_context_binary_file_name = node_helper.Get(EP_CACHE_CONTEXT, ""); - cache_file.seekg(0, cache_file.end); - buffer_size = static_cast(cache_file.tellg()); - ORT_RETURN_IF(0 == buffer_size, "Empty cache file encountered."); + std::string context_binary_path(std::filesystem::path(ctx_onnx_model_path).parent_path().string() + + "/" + external_qnn_context_binary_file_name); + size_t buffer_size{0}; + std::ifstream cache_file(context_binary_path.c_str(), std::ifstream::binary); + ORT_RETURN_IF(!cache_file || !cache_file.good(), "Failed to open cache file."); - cache_file.seekg(0, cache_file.beg); - std::unique_ptr buffer = std::make_unique(buffer_size); - ORT_RETURN_IF(nullptr == buffer, "Failed to allocate memory for cache file."); - // Load file into buffer - const auto& read_result = cache_file.read(buffer.get(), buffer_size); - ORT_RETURN_IF(!read_result, "Failed to read contents from cached context file."); - cache_file.close(); - return qnn_backend_manager->LoadCachedQnnContextFromBuffer(buffer.get(), - static_cast(buffer_size), - qnn_model); - } + cache_file.seekg(0, cache_file.end); + buffer_size = static_cast(cache_file.tellg()); + ORT_RETURN_IF(0 == buffer_size, "Empty cache file encountered."); - return Status::OK(); + cache_file.seekg(0, cache_file.beg); + std::unique_ptr buffer = std::make_unique(buffer_size); + ORT_RETURN_IF(nullptr == buffer, "Failed to allocate memory for cache file."); + // Load file into buffer + const auto& read_result = cache_file.read(buffer.get(), buffer_size); + ORT_RETURN_IF(!read_result, "Failed to read contents from cached context file."); + cache_file.close(); + return qnn_backend_manager->LoadCachedQnnContextFromBuffer(buffer.get(), + static_cast(buffer_size), + qnn_model); } Status QnnCacheModelHandler::GetMetadataFromEpContextModel(const std::string& ctx_onnx_model_path, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index fa859ce81be98..dd56731ac9f7f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -570,8 +570,7 @@ Status QnnBackendManager::SetHtpPowerConfig() { "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = 
htp_infra->perfInfra; // Get power client id - uint32_t powerconfig_client_id = 0; - status = htp_perf_infra.createPowerConfigId(/*device_id=*/0, /*core_id=*/0, &powerconfig_client_id); + status = htp_perf_infra.createPowerConfigId(/*device_id=*/0, /*core_id=*/0, &htp_power_config_client_id_); ORT_RETURN_IF(QNN_SUCCESS != status, "createPowerConfigId failed."); constexpr const int kNumConfigs = 1; @@ -580,7 +579,7 @@ Status QnnBackendManager::SetHtpPowerConfig() { QnnHtpPerfInfrastructure_PowerConfig_t& dcvs_config = power_configs[0]; dcvs_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; QnnHtpPerfInfrastructure_DcvsV3_t& dcvs_v3 = dcvs_config.dcvsV3Config; - dcvs_v3.contextId = powerconfig_client_id; + dcvs_v3.contextId = htp_power_config_client_id_; dcvs_v3.setSleepDisable = 0; dcvs_v3.sleepDisable = 0; dcvs_v3.setDcvsEnable = 1; @@ -678,7 +677,7 @@ Status QnnBackendManager::SetHtpPowerConfig() { break; } std::vector perf_power_configs_ptr_ = ObtainNullTermPtrVector(power_configs); - status = htp_perf_infra.setPowerConfig(powerconfig_client_id, perf_power_configs_ptr_.data()); + status = htp_perf_infra.setPowerConfig(htp_power_config_client_id_, perf_power_configs_ptr_.data()); ORT_RETURN_IF(QNN_SUCCESS != status, "setPowerConfig failed for HTP performance mode."); // Set rpc control latency here, but note that v68 doesn't support rpc polling mode. @@ -692,7 +691,7 @@ Status QnnBackendManager::SetHtpPowerConfig() { rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; rpc_control_latency.rpcControlLatencyConfig = rpc_control_latency_; perf_power_configs_ptr_ = ObtainNullTermPtrVector(rpc_power_configs); - status = htp_perf_infra.setPowerConfig(powerconfig_client_id, perf_power_configs_ptr_.data()); + status = htp_perf_infra.setPowerConfig(htp_power_config_client_id_, perf_power_configs_ptr_.data()); ORT_RETURN_IF(QNN_SUCCESS != status, "setPowerConfig failed for RPC control latency."); } @@ -713,12 +712,36 @@ void QnnBackendManager::Split(std::vector& split_string, } } +Status QnnBackendManager::DestroyHTPPowerConfigID() { + if (htp_performance_mode_ == HtpPerformanceMode::kHtpDefault) { + return Status::OK(); + } + + QnnDevice_Infrastructure_t qnn_device_infra = nullptr; + auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); + ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); + + auto* htp_infra = static_cast(qnn_device_infra); + ORT_RETURN_IF(QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF != htp_infra->infraType, + "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); + QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; + + Qnn_ErrorHandle_t destroy_ret = htp_perf_infra.destroyPowerConfigId(htp_power_config_client_id_); + ORT_RETURN_IF(QNN_SUCCESS != destroy_ret, "destroyPowerConfigId failed."); + return Status::OK(); +} + void QnnBackendManager::ReleaseResources() { if (!backend_setup_completed_) { return; } - auto result = ReleaseContext(); + auto result = DestroyHTPPowerConfigID(); + if (Status::OK() != result) { + ORT_THROW("Failed to DestroyHTPPowerConfigID."); + } + + result = ReleaseContext(); if (Status::OK() != result) { ORT_THROW("Failed to ReleaseContext."); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 9cb6a322149b9..de5ccb5a28389 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ 
b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -133,6 +133,8 @@ class QnnBackendManager { Status UnloadLib(void* handle); + Status DestroyHTPPowerConfigID(); + void* LibFunction(void* handle, const char* symbol, std::string& error_msg); template @@ -201,6 +203,7 @@ class QnnBackendManager { std::set mod_handles_; #endif const std::string qnn_saver_path_; + uint32_t htp_power_config_client_id_ = 0; }; } // namespace qnn diff --git a/onnxruntime/core/providers/qnn/builder/qnn_def.h b/onnxruntime/core/providers/qnn/builder/qnn_def.h index 8649db92be027..6080c63b555a8 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_def.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_def.h @@ -48,6 +48,14 @@ enum class HtpPerformanceMode : uint8_t { kHtpBalanced, }; +// Defines the graph optimization strategy used by the HTP backend. +enum class HtpGraphFinalizationOptimizationMode : uint8_t { + kDefault = 0, + kMode1 = 1, // Faster preparation time, less optimal graph + kMode2 = 2, // Longer preparation time, more optimal graph + kMode3 = 3, // Longest preparation time, most likely even more optimal graph. +}; + enum class QnnBackendType : uint8_t { CPU = 0, GPU, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc b/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc new file mode 100644 index 0000000000000..63aa01b48e7e2 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/qnn/builder/qnn_graph_configs_helper.h" + +#include "HTP/QnnHtpGraph.h" + +namespace onnxruntime { +namespace qnn { + +const QnnGraph_Config_t** QnnGraphConfigsBuilder::GetQnnGraphConfigs() { + if (graph_config_ptrs_.empty()) { + return nullptr; + } + + if (!IsNullTerminated()) { + graph_config_ptrs_.push_back(nullptr); + } + + return graph_config_ptrs_.data(); +} + +QnnHtpGraph_CustomConfig_t& QnnGraphConfigsBuilder::PushHtpGraphCustomConfig() { + htp_custom_graph_configs_.push_back(QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT); + return htp_custom_graph_configs_.back(); +} + +QnnGraph_Config_t& QnnGraphConfigsBuilder::PushGraphConfig() { + graph_configs_.push_back(QNN_GRAPH_CONFIG_INIT); + QnnGraph_Config_t& config = graph_configs_.back(); + + // Add pointer to this new graph config to the list of graph config pointers. + if (IsNullTerminated()) { + graph_config_ptrs_.back() = &config; // Replace last nullptr entry. + } else { + graph_config_ptrs_.push_back(&config); + } + + return config; +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h b/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h new file mode 100644 index 0000000000000..8c4928fdacbc4 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h @@ -0,0 +1,56 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#include "HTP/QnnHtpGraph.h" + +namespace onnxruntime { +namespace qnn { + +/** + * Helper class for building a null-terminated list of QNN Graph configurations. + * A QNN configuration consists of multiple objects with references to each other. This + * class ensures that all configuration objects have the same lifetime, so that they remain valid + * across the call to graphCreate(). 
+ */ +class QnnGraphConfigsBuilder { + public: + /** + * Returns a pointer to the beginning of a null-terminated array of QNN Graph configurations. + * This result is passed QNN's graphCreate() API. + * + * \return Pointer to null-terminated QnnGraph_Config_t* array. + */ + const QnnGraph_Config_t** GetQnnGraphConfigs(); + + /** + * Creates and returns a reference to a new HTP graph configuration object. The object is initialized to + * the QNN recommended default value. The caller is meant to override fields in this object. + * + * \return A reference to a default QnnHtpGraph_CustomConfig_t object. + */ + QnnHtpGraph_CustomConfig_t& PushHtpGraphCustomConfig(); + + /** + * Creates and returns a reference to a new graph configuration object. The object is initialized to + * the QNN recommended default value. The caller is meant to override fields in this object. + * + * \return A reference to a default QnnGraph_Config_t object. + */ + QnnGraph_Config_t& PushGraphConfig(); + + private: + bool IsNullTerminated() const { + return !graph_config_ptrs_.empty() && graph_config_ptrs_.back() == nullptr; + } + + InlinedVector htp_custom_graph_configs_; + InlinedVector graph_configs_; + InlinedVector graph_config_ptrs_; +}; + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 0a458f2602b89..fd3a95b5f1f78 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -87,7 +87,8 @@ const NodeUnit& QnnModel::GetNodeUnit(const Node* node, } Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer, - const onnxruntime::Node& fused_node) { + const onnxruntime::Node& fused_node, + const QnnGraph_Config_t** graph_configs) { LOGS(logger_, VERBOSE) << "ComposeGraph Graph name: " << graph_viewer.Name(); // Holder for the NodeUnits in the graph, this will guarantee the NodeUnits is @@ -107,7 +108,7 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer, initializer_inputs_, qnn_backend_manager_->GetQnnBackendType()); bool rt = true; - rt = qnn_model_wrapper.CreateQnnGraph(qnn_backend_manager_->GetQnnContext(), graph_name); + rt = qnn_model_wrapper.CreateQnnGraph(qnn_backend_manager_->GetQnnContext(), graph_name, graph_configs); if (!rt) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to initialize qnn_model_wrapper."); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.h b/onnxruntime/core/providers/qnn/builder/qnn_model.h index 373995106f31c..de4f872f73ccf 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.h @@ -27,7 +27,8 @@ class QnnModel { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QnnModel); Status ComposeGraph(const GraphViewer& graph_viewer, - const onnxruntime::Node& fused_node); + const onnxruntime::Node& fused_node, + const QnnGraph_Config_t** graph_configs = nullptr); Status FinalizeGraphs(); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index d3aafcbecd322..6cb276378a09c 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -76,6 +76,24 @@ void QNNExecutionProvider::ParseHtpPerformanceMode(std::string htp_performance_m } } +void QNNExecutionProvider::ParseHtpGraphFinalizationOptimizationMode(const std::string& htp_graph_finalization_opt_mode_string) { + 
LOGS_DEFAULT(VERBOSE) << "HTP graph finalization optimization mode: " + << htp_graph_finalization_opt_mode_string; + + if (htp_graph_finalization_opt_mode_string.empty() || htp_graph_finalization_opt_mode_string == "0") { + htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; + } else if (htp_graph_finalization_opt_mode_string == "1") { + htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kMode1; + } else if (htp_graph_finalization_opt_mode_string == "2") { + htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kMode2; + } else if (htp_graph_finalization_opt_mode_string == "3") { + htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kMode3; + } else { + LOGS_DEFAULT(WARNING) << "Invalid HTP graph finalization optimization mode: " + << htp_graph_finalization_opt_mode_string; + } +} + QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options) : IExecutionProvider{onnxruntime::kQnnExecutionProvider, true}, @@ -140,6 +158,13 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio ParseHtpPerformanceMode(htp_performance_mode_pos->second); } + htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; + static const std::string HTP_GRAPH_FINALIZATION_OPT_MODE = "htp_graph_finalization_optimization_mode"; + auto htp_graph_finalization_opt_mode_pos = runtime_options_.find(HTP_GRAPH_FINALIZATION_OPT_MODE); + if (htp_graph_finalization_opt_mode_pos != runtime_options_.end()) { + ParseHtpGraphFinalizationOptimizationMode(htp_graph_finalization_opt_mode_pos->second); + } + // Enable use of QNN Saver if the user provides a path the QNN Saver backend library. 
static const std::string QNN_SAVER_PATH_KEY = "qnn_saver_path"; std::string qnn_saver_path; @@ -448,6 +473,20 @@ Status QNNExecutionProvider::CreateComputeFunc(std::vector& nod return Status::OK(); } +void QNNExecutionProvider::InitQnnGraphConfigs(qnn::QnnGraphConfigsBuilder& configs_builder) const { + if (qnn_backend_manager_->GetQnnBackendType() == qnn::QnnBackendType::HTP && + htp_graph_finalization_opt_mode_ != qnn::HtpGraphFinalizationOptimizationMode::kDefault) { + QnnHtpGraph_CustomConfig_t& htp_graph_opt_config = configs_builder.PushHtpGraphCustomConfig(); + htp_graph_opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + htp_graph_opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + htp_graph_opt_config.optimizationOption.floatValue = static_cast(htp_graph_finalization_opt_mode_); + + QnnGraph_Config_t& graph_opt_config = configs_builder.PushGraphConfig(); + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &htp_graph_opt_config; + } +} + Status QNNExecutionProvider::CompileFromOrtGraph(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs, const logging::Logger& logger) { @@ -458,7 +497,10 @@ Status QNNExecutionProvider::CompileFromOrtGraph(const std::vector qnn_model = std::make_unique(logger, qnn_backend_manager_.get()); - ORT_RETURN_IF_ERROR(qnn_model->ComposeGraph(graph_viewer, fused_node)); + qnn::QnnGraphConfigsBuilder graph_configs_builder; + InitQnnGraphConfigs(graph_configs_builder); + + ORT_RETURN_IF_ERROR(qnn_model->ComposeGraph(graph_viewer, fused_node, graph_configs_builder.GetQnnGraphConfigs())); ORT_RETURN_IF_ERROR(qnn_model->FinalizeGraphs()); ORT_RETURN_IF_ERROR(qnn_model->SetupQnnInputOutput()); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index c63a60018aca8..a01b828531555 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -9,6 +9,7 @@ #include "core/providers/qnn/builder/qnn_backend_manager.h" #include "core/providers/qnn/builder/qnn_model.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" +#include "core/providers/qnn/builder/qnn_graph_configs_helper.h" namespace onnxruntime { @@ -57,10 +58,15 @@ class QNNExecutionProvider : public IExecutionProvider { void ParseHtpPerformanceMode(std::string htp_performance_mode_string); + void ParseHtpGraphFinalizationOptimizationMode(const std::string& htp_graph_finalization_opt_mode_string); + + void InitQnnGraphConfigs(qnn::QnnGraphConfigsBuilder& configs_holder) const; + private: ProviderOptions runtime_options_; qnn::ProfilingLevel profiling_level_ = qnn::ProfilingLevel::OFF; qnn::HtpPerformanceMode htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; + qnn::HtpGraphFinalizationOptimizationMode htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; std::unique_ptr qnn_backend_manager_; std::unordered_map> qnn_models_; uint32_t rpc_control_latency_ = 0; diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 0526ccca5bb4e..98646058eec3d 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -59,6 +59,8 @@ void usage() { "\t [QNN only] [qnn_context_embed_mode]: 1 means dump the QNN context binary into the Onnx skeleton model.\n" "\t 0 means dump the QNN context binary into separate bin file and set the path in the Onnx skeleton model.\n" 
"\t [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n" + "\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n" + "\t '0', '1', '2', '3', default is '0'.\n" "\t [Usage]: -e -i '| |' \n\n" "\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_path|/folderpath/libQnnCpu.so\" \n\n" "\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n" @@ -488,9 +490,19 @@ int real_main(int argc, char* argv[], Ort::Env& env) { } } else if (key == "qnn_saver_path") { // no validation + } else if (key == "htp_graph_finalization_optimization_mode") { + std::unordered_set supported_htp_graph_final_opt_modes = {"0", "1", "2", "3"}; + if (supported_htp_graph_final_opt_modes.find(value) == supported_htp_graph_final_opt_modes.end()) { + std::ostringstream str_stream; + std::copy(supported_htp_graph_final_opt_modes.begin(), supported_htp_graph_final_opt_modes.end(), + std::ostream_iterator(str_stream, ",")); + std::string str = str_stream.str(); + ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str); + } } else { ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable', -'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode'])"); +'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode', 'qnn_saver_path', +'htp_graph_finalization_optimization_mode'])"); } qnn_options[key] = value; diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 6d075fec997b5..72472e5798792 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -71,6 +71,9 @@ namespace perftest { "\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n" "\t [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n" "\t 'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n" + "\t [QNN only] [qnn_saver_path]: QNN Saver backend path. 
e.g '/folderpath/libQnnSaver.so'.\n" + "\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n" + "\t '0', '1', '2', '3', default is '0'.\n" "\t [Usage]: -e -i '| |'\n\n" "\t [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU_FP32 enable_npu_fast_compile|true num_of_threads|5 enable_opencl_throttling|true cache_dir|\"\"\"\n" "\t [Example] [For QNN EP] -e qnn -i \"backend_path|/folderpath/libQnnCpu.so\" \n\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index b7a111783fc94..f3ea188043dbe 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -356,9 +356,21 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device std::string str = str_stream.str(); ORT_THROW("Supported htp_performance_mode: " + str); } + } else if (key == "qnn_saver_path") { + // no validation + } else if (key == "htp_graph_finalization_optimization_mode") { + std::unordered_set supported_htp_graph_final_opt_modes = {"0", "1", "2", "3"}; + if (supported_htp_graph_final_opt_modes.find(value) == supported_htp_graph_final_opt_modes.end()) { + std::ostringstream str_stream; + std::copy(supported_htp_graph_final_opt_modes.begin(), supported_htp_graph_final_opt_modes.end(), + std::ostream_iterator(str_stream, ",")); + std::string str = str_stream.str(); + ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str); + } } else { ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable', -'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode'])"); +'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode', 'qnn_saver_path', +'htp_graph_finalization_optimization_mode'])"); } qnn_options[key] = value; diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 5f63813d8d84e..02ff834169b2b 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -173,7 +173,8 @@ TEST(QnnEP, TestDisableCPUFallback_ConflictingConfig) { // The models passed to this function are subgraphs extracted from a larger model that exhibited // shape inferencing issues on QNN. Thus, the models are expected to have a specific input/output // types and shapes. -static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bool enable_qnn_saver = false) { +static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bool enable_qnn_saver = false, + std::string htp_graph_finalization_opt_mode = "") { Ort::SessionOptions so; // Ensure all type/shape inference warnings result in errors! @@ -194,6 +195,10 @@ static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bo } #endif + if (!htp_graph_finalization_opt_mode.empty()) { + options["htp_graph_finalization_optimization_mode"] = std::move(htp_graph_finalization_opt_mode); + } + so.AppendExecutionProvider("QNN", options); Ort::Session session(*ort_env, ort_model_path, so); @@ -302,6 +307,21 @@ TEST_F(QnnHTPBackendTests, QnnSaver_OutputFiles) { EXPECT_TRUE(std::filesystem::exists(qnn_saver_output_dir / "params.bin")); } +// Test that models run with various HTP graph finalization optimization modes. 
+TEST_F(QnnHTPBackendTests, HTPGraphFinalizationOptimizationModes) { + constexpr std::array graph_opt_modes = {"", // No explicit mode specified + "0", // Explicit default mode + "1", // Mode 1 + "2", // Mode 2 + "3"}; // Mode 3 + for (auto mode : graph_opt_modes) { + RunNHWCResizeModel(ORT_MODEL_FOLDER "nhwc_resize_sizes_opset18.quant.onnx", + true, // use_htp + false, // enable_qnn_saver + mode); // htp_graph_finalization_opt_mode + } +} + #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) #endif // !defined(ORT_MINIMAL_BUILD) From a37e6a503b3073605b1eb765d1b0ff5a61360991 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 8 Nov 2023 11:19:45 -0800 Subject: [PATCH 06/13] Update Abseil raw_flat_hash visualization (#18329) ### Description Fix the broken pieces due to the latest Abseil update. ### Motivation and Context - empty - {{ size={size_} }} + + + + + + empty + size={ _size() } + size=({_size()}) - size_ - capacity_ - + _size() + _capacity() + - size_ + _size() - - slots_[nslot] + + _slots()[nslot] nslot++ - + + + + + *($T1 *){value} + (*($T1 *){value}) + + *($T1 *){value} + + + + + + *($T1 *)this + (*($T1 *)this) + + *($T1 *)this + + + - {{ {value.first}:{value.second} }} + {value.first}, {value.second} + ({value.first}, {value.second}) - value.first - value.second + value.first + value.second From 68fab24c22e1453ebfcd9d67e857c6531985faa0 Mon Sep 17 00:00:00 2001 From: sophies927 <107952697+sophies927@users.noreply.github.com> Date: Wed, 8 Nov 2023 11:56:35 -0800 Subject: [PATCH 07/13] Update stale.yml (#18304) Exempt all issues w/ assignees from stale bot, increase days before issue close, + add start date to address issue w/ GH API rate limiting ### Description ### Motivation and Context --- .github/workflows/stale.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 67d8550d44204..e49e0921af434 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -17,10 +17,14 @@ jobs: with: # Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale exempt-issue-labels: contributions welcome, feature request, regression + # Override exempt-all-assignees but only to exempt the issues with an assignee to be marked as stale automatically + exempt-all-issue-assignees: true + # Used to ignore the issues and pull requests created before the start date + start-date: 20220419 # Number of days without activity before the actions/stale action labels an issue days-before-issue-stale: 30 # Number of days without activity before the actions/stale action closes an issue - days-before-issue-close: 7 + days-before-issue-close: 30 # Label you want to apply to issues that have been inactive for the amount of time specified by days-before-issue-stale stale-issue-label: "stale" # Comment that you want to add to issues that are labeled by the actions/stale action From 812532592e070823f0362af8c3219ca6b01eda77 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 8 Nov 2023 13:03:18 -0800 Subject: [PATCH 08/13] Add a build validation for Linux ARM64 cross-compile (#18200) ### Description 1. Add a build validation for Linux ARM64/ARM32 cross-compile to catch issues listed in #18195 . 2. Revert eigen's commit id back to what we had before. ### Motivation and Context To catch cross-compile issues. 
Added a TODO item for fixing the compile warnings in Linux ARM32 build: AB#21639 --- cgmanifests/cgmanifest.json | 2 +- cgmanifests/generated/cgmanifest.json | 4 +- cmake/deps.txt | 9 ++- .../linux_arm32_crosscompile_toolchain.cmake | 9 +++ .../linux_arm64_crosscompile_toolchain.cmake | 9 +++ onnxruntime/core/common/threadpool.cc | 12 ++-- onnxruntime/core/platform/windows/env.cc | 2 +- .../azure-pipelines/linux-ci-pipeline.yml | 58 ++++++++++++++++++- .../templates/download-deps.yml | 4 +- tools/ci_build/replace_urls_in_deps.py | 12 ++++ 10 files changed, 104 insertions(+), 17 deletions(-) create mode 100644 cmake/linux_arm32_crosscompile_toolchain.cmake create mode 100644 cmake/linux_arm64_crosscompile_toolchain.cmake diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json index 2a3de3bb0ee51..e8dbc9cf9eff6 100644 --- a/cgmanifests/cgmanifest.json +++ b/cgmanifests/cgmanifest.json @@ -568,7 +568,7 @@ "component": { "type": "git", "git": { - "commitHash": "d10b27fe37736d2944630ecd7557cefa95cf87c9", + "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac", "repositoryUrl": "https://gitlab.com/libeigen/eigen.git" } } diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 6b0e3659bd234..0886a29fa573e 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -136,7 +136,7 @@ "component": { "type": "git", "git": { - "commitHash": "003c580e696a774afdc984996ee909b7c8d8128c", + "commitHash": "0da379fc4808f9601faef392352018c741c0f297", "repositoryUrl": "https://github.com/google/XNNPACK.git" }, "comments": "googlexnnpack" @@ -226,7 +226,7 @@ "component": { "type": "git", "git": { - "commitHash": "1787867f6183f056420e532eec640cba25efafea", + "commitHash": "4fe0e1e183925bf8cfa6aae24237e724a96479b8", "repositoryUrl": "https://github.com/Maratyszcza/pthreadpool.git" }, "comments": "pthreadpool" diff --git a/cmake/deps.txt b/cmake/deps.txt index 4aab8c974d62f..275b5eaf6b976 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -9,14 +9,19 @@ #since the file contains a version string: "lts_20230802". However, the file is for debugging purposes only and would #not affect built binaries. # -# NOTE: You must run deps_update_and_upload.py when ready to test your changes in a CI. +# NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI. # See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29 # abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.0.zip;04271dfbfac59269b6939e1e9d5faf0d18a7ba91 cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0 date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159 dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b31321e5549591d78aa7f377173445 -eigen;https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip;ef24286b7ece8737c99fa831b02941843546c081 +# This Eigen commit id matches the eigen archive being consumed from https://gitlab.com/libeigen/eigen/-/archive/3.4/eigen-3.4.zip +# prior to the 3.4.1 RC changing the bits and invalidating the hash. 
+# it contains changes on top of 3.4.0 which are required to fix build issues. +# Until the 3.4.1 release this is the best option we have. +# Issue link: https://gitlab.com/libeigen/eigen/-/issues/2744 +eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;be8be39fdbc6e60e94fa7870b280707069b5b81a flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v1.12.0.zip;ba0a75fd12dbef8f6557a74e611b7a3d0c5fe7bf fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494 fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1 diff --git a/cmake/linux_arm32_crosscompile_toolchain.cmake b/cmake/linux_arm32_crosscompile_toolchain.cmake new file mode 100644 index 0000000000000..0183262a8875e --- /dev/null +++ b/cmake/linux_arm32_crosscompile_toolchain.cmake @@ -0,0 +1,9 @@ + #This file is just a sample. You may need to modify it before using. + SET(CMAKE_SYSTEM_NAME Linux) + SET(CMAKE_SYSTEM_VERSION 1) + SET(CMAKE_C_COMPILER arm-none-linux-gnueabihf-gcc) + SET(CMAKE_CXX_COMPILER arm-none-linux-gnueabihf-g++) + SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) + SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) + SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + SET(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) \ No newline at end of file diff --git a/cmake/linux_arm64_crosscompile_toolchain.cmake b/cmake/linux_arm64_crosscompile_toolchain.cmake new file mode 100644 index 0000000000000..1a492bbc269e7 --- /dev/null +++ b/cmake/linux_arm64_crosscompile_toolchain.cmake @@ -0,0 +1,9 @@ + #This file is just a sample. You may need to modify it before using. 
+ SET(CMAKE_SYSTEM_NAME Linux) + SET(CMAKE_SYSTEM_VERSION 1) + SET(CMAKE_C_COMPILER aarch64-none-linux-gnu-gcc) + SET(CMAKE_CXX_COMPILER aarch64-none-linux-gnu-g++) + SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) + SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) + SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + SET(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) \ No newline at end of file diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc index f29ab19608934..10e117267e14b 100644 --- a/onnxruntime/core/common/threadpool.cc +++ b/onnxruntime/core/common/threadpool.cc @@ -562,7 +562,7 @@ static ptrdiff_t CalculateParallelForBlock(const ptrdiff_t n, const Eigen::Tenso constexpr ptrdiff_t max_oversharding_factor = 4; ptrdiff_t block_size = Eigen::numext::mini( n, - Eigen::numext::maxi(Eigen::divup(n, max_oversharding_factor * num_threads), static_cast(block_size_f))); + Eigen::numext::maxi(Eigen::numext::div_ceil(n, max_oversharding_factor * num_threads), static_cast(block_size_f))); const ptrdiff_t max_block_size = Eigen::numext::mini(n, 2 * block_size); if (block_align) { @@ -571,19 +571,19 @@ static ptrdiff_t CalculateParallelForBlock(const ptrdiff_t n, const Eigen::Tenso block_size = Eigen::numext::mini(n, new_block_size); } - ptrdiff_t block_count = Eigen::divup(n, block_size); + ptrdiff_t block_count = Eigen::numext::div_ceil(n, block_size); // Calculate parallel efficiency as fraction of total CPU time used for // computations: double max_efficiency = - static_cast(block_count) / (Eigen::divup(block_count, num_threads) * num_threads); + static_cast(block_count) / (Eigen::numext::div_ceil(block_count, num_threads) * num_threads); // Now try to increase block size up to max_block_size as long as it // doesn't decrease parallel efficiency. for (ptrdiff_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) { // This is the next block size that divides size into a smaller number // of blocks than the current block_size. - ptrdiff_t coarser_block_size = Eigen::divup(n, prev_block_count - 1); + ptrdiff_t coarser_block_size = Eigen::numext::div_ceil(n, prev_block_count - 1); if (block_align) { ptrdiff_t new_block_size = block_align(coarser_block_size); assert(new_block_size >= coarser_block_size); @@ -593,11 +593,11 @@ static ptrdiff_t CalculateParallelForBlock(const ptrdiff_t n, const Eigen::Tenso break; // Reached max block size. Stop. } // Recalculate parallel efficiency. - const ptrdiff_t coarser_block_count = Eigen::divup(n, coarser_block_size); + const ptrdiff_t coarser_block_count = Eigen::numext::div_ceil(n, coarser_block_size); assert(coarser_block_count < prev_block_count); prev_block_count = coarser_block_count; const double coarser_efficiency = - static_cast(coarser_block_count) / (Eigen::divup(coarser_block_count, num_threads) * num_threads); + static_cast(coarser_block_count) / (Eigen::numext::div_ceil(coarser_block_count, num_threads) * num_threads); if (coarser_efficiency + 0.01 >= max_efficiency) { // Taking it. block_size = coarser_block_size; diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index f02c61daabeed..45648010baf86 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -32,7 +32,7 @@ limitations under the License. 
#include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" -#include "unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h" +#include #include #include "core/platform/path_lib.h" // for LoopDir() diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index 395c190ce9e11..f46febee178e1 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -56,10 +56,23 @@ stages: clean: true submodules: none - - task: UsePythonVersion@0 + - task: DownloadPackage@1 + displayName: 'Download ARM64 GCC' inputs: - versionSpec: '3.8' - addToPath: true + packageType: upack + feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' + definition: 'gcc_aarch64_linux_gnu_host_x86_64' + version: 13.2.1 + downloadPath: $(Build.BinariesDirectory)/gcc + + - task: DownloadPackage@1 + displayName: 'Download ARM32 GCC' + inputs: + packageType: upack + feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' + definition: 'gcc_aarch32_linux_gnu_host_x86_64' + version: 13.2.1 + downloadPath: $(Build.BinariesDirectory)/gcc - template: templates/get-docker-image-steps.yml parameters: @@ -68,6 +81,45 @@ stages: DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=registry.access.redhat.com/ubi8/ubi" Repository: onnxruntimecpubuild + - task: PythonScript@0 + displayName: 'Update deps.txt' + inputs: + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py + arguments: --new_dir $(Build.BinariesDirectory)/deps + workingDirectory: $(Build.BinariesDirectory) + pythonInterpreter: /usr/bin/python3 + + - script: | + set -e -x + # ARM64 build + mkdir -p $(Build.BinariesDirectory)/gccbin + tar -Jxf $(Build.BinariesDirectory)/gcc/arm-gnu-toolchain-13.2.rel1-x86_64-aarch64-none-linux-gnu.tar.xz --strip=1 -C $(Build.BinariesDirectory)/gccbin + export PATH=$(Build.BinariesDirectory)/gccbin/bin:$PATH + mkdir $(Build.BinariesDirectory)/aarch64build + cd $(Build.BinariesDirectory)/aarch64build + cmake $(Build.SourcesDirectory)/cmake -Donnxruntime_ENABLE_CPUINFO=OFF -DPython_EXECUTABLE=/usr/bin/python3 -DPYTHON_EXECUTABLE=/usr/bin/python3 -DCMAKE_BUILD_TYPE=Debug -DCMAKE_TOOLCHAIN_FILE=$(Build.SourcesDirectory)/cmake/linux_arm64_crosscompile_toolchain.cmake -G Ninja + ninja + rm -rf $(Build.BinariesDirectory)/aarch64build $(Build.BinariesDirectory)/gccbin + # ARM32 build + mkdir -p $(Build.BinariesDirectory)/gccbin + tar -Jxf $(Build.BinariesDirectory)/gcc/arm-gnu-toolchain-13.2.rel1-x86_64-arm-none-linux-gnueabihf.tar.xz --strip=1 -C $(Build.BinariesDirectory)/gccbin + ls $(Build.BinariesDirectory)/gccbin/bin + mkdir $(Build.BinariesDirectory)/arm32build + cd $(Build.BinariesDirectory)/arm32build + # TODO: fix the warnings and remove the --compile-no-warning-as-error arg + cmake --compile-no-warning-as-error $(Build.SourcesDirectory)/cmake -Donnxruntime_ENABLE_CPUINFO=OFF -DPython_EXECUTABLE=/usr/bin/python3 -DPYTHON_EXECUTABLE=/usr/bin/python3 -DCMAKE_BUILD_TYPE=Debug -DCMAKE_TOOLCHAIN_FILE=$(Build.SourcesDirectory)/cmake/linux_arm32_crosscompile_toolchain.cmake -G Ninja + ninja + rm -rf $(Build.BinariesDirectory)/arm32build $(Build.BinariesDirectory)/gccbin + displayName: Cross-compile for Linux ARM32 and ARM64 + + - task: PythonScript@0 + displayName: 'Update deps.txt' + inputs: + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py + arguments: --new_dir /build/deps + 
workingDirectory: $(Build.BinariesDirectory) + pythonInterpreter: /usr/bin/python3 + - template: templates/linux-build-step-with-cache.yml parameters: WithCache: true diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index bb9f1fd277e2d..a78f743c15347 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.110 + version: 1.0.114 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.110 + version: 1.0.114 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/replace_urls_in_deps.py b/tools/ci_build/replace_urls_in_deps.py index 28e3c91107c6c..ac4f515d5482b 100644 --- a/tools/ci_build/replace_urls_in_deps.py +++ b/tools/ci_build/replace_urls_in_deps.py @@ -8,6 +8,7 @@ import argparse import csv import os +import shutil from dataclasses import dataclass from pathlib import Path @@ -46,7 +47,16 @@ def main(): deps = [] csv_file_path = Path(REPO_DIR) / "cmake" / "deps.txt" + backup_csv_file_path = Path(REPO_DIR) / "cmake" / "deps.txt.bak" + # prefer to use the backup file + if backup_csv_file_path.exists(): + csv_file_path = backup_csv_file_path + else: + # Make a copy before modifying it + print("Making a copy to %s" % str(backup_csv_file_path)) + shutil.copy(csv_file_path, backup_csv_file_path) + print("Reading from %s" % str(csv_file_path)) # Read the whole file into memory first with csv_file_path.open("r", encoding="utf-8") as f: depfile_reader = csv.reader(f, delimiter=";") @@ -58,6 +68,8 @@ def main(): continue deps.append(Dep(row[0], row[1], row[2])) + csv_file_path = Path(REPO_DIR) / "cmake" / "deps.txt" + print("Writing to %s" % str(csv_file_path)) # Write updated content back with csv_file_path.open("w", newline="", encoding="utf-8") as f: depfile_writer = csv.writer(f, delimiter=";") From c250540722a8028250de1d572365ec73ef6ecedf Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Wed, 8 Nov 2023 13:04:40 -0800 Subject: [PATCH 09/13] Bump linter versions (#18341) Bump linter versions and run format. 
--- .../core/platform/EigenNonBlockingThreadPool.h | 2 +- onnxruntime/python/backend/backend.py | 4 ++-- .../models/stable_diffusion/diffusion_models.py | 2 +- .../providers/cpu/math/element_wise_ops_test.cc | 4 ++-- .../providers/cpu/reduction/reduction_ops_test.cc | 4 ++-- .../test/providers/nnapi/nnapi_basic_test.cc | 9 +++++---- .../python/training/ortmodule/_fallback.py | 4 ++-- requirements-lintrunner.txt | 8 ++++---- tools/ci_build/build.py | 10 ++++------ ...nd_optimizer_opset_version_updates_required.py | 2 +- winml/lib/Api.Image/inc/ImageConversionHelpers.h | 3 ++- .../Api.Image/inc/TensorToVideoFrameConverter.h | 5 +---- .../Api.Image/inc/VideoFrameToTensorConverter.h | 5 +---- winml/lib/Api.Ort/OnnxruntimeModel.cpp | 6 ++++-- winml/lib/Api/impl/TensorBase.h | 3 ++- winml/test/adapter/AdapterDmlEpTest.cpp | 6 ++++-- winml/test/adapter/AdapterSessionTest.cpp | 3 ++- winml/test/api/LearningModelAPITest.cpp | 9 ++++++--- winml/test/api/LearningModelBindingAPITest.cpp | 3 ++- winml/test/api/LearningModelSessionAPITest.cpp | 9 ++++++--- winml/test/api/RawApiHelpers.cpp | 3 ++- winml/test/api/RawApiTestsGpu.cpp | 3 ++- winml/test/concurrency/ConcurrencyTests.cpp | 12 ++++++++---- winml/test/image/imageTestHelper.cpp | 3 ++- winml/test/image/imagetests.cpp | 3 ++- winml/test/model/model_tests.cpp | 3 ++- winml/test/model/skip_model_tests.h | 10 ++++------ winml/test/scenario/cppwinrt/CustomNullOp.h | 3 ++- winml/test/scenario/cppwinrt/CustomOps.cpp | 15 ++++++++------- winml/test/scenario/cppwinrt/NoisyReluCpu.h | 12 ++++++++---- winml/test/scenario/cppwinrt/ReluCpu.h | 3 ++- .../scenario/cppwinrt/scenariotestscppwinrt.cpp | 9 ++++++--- 32 files changed, 102 insertions(+), 78 deletions(-) diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h index a57385f6e23f1..f9b694efb936f 100644 --- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h +++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h @@ -278,7 +278,7 @@ class ThreadPoolProfiler { int num_threads_; #ifdef _MSC_VER #pragma warning(push) -// C4324: structure was padded due to alignment specifier + // C4324: structure was padded due to alignment specifier #pragma warning(disable : 4324) #endif // _MSC_VER struct ORT_ALIGN_TO_AVOID_FALSE_SHARING ChildThreadStat { diff --git a/onnxruntime/python/backend/backend.py b/onnxruntime/python/backend/backend.py index 1edae383e93e6..97b7358f2a223 100644 --- a/onnxruntime/python/backend/backend.py +++ b/onnxruntime/python/backend/backend.py @@ -63,7 +63,7 @@ def is_opset_supported(cls, model): error_message = ( "Skipping this test as only released onnx opsets are supported." "To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0." - " Got Domain '{}' version '{}'.".format(domain, opset.version) + f" Got Domain '{domain}' version '{opset.version}'." ) return False, error_message except AttributeError: @@ -74,7 +74,7 @@ def is_opset_supported(cls, model): error_message = ( "Skipping this test as only released onnx opsets are supported." "To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0." - " Got Domain '{}' version '{}'.".format(domain, opset.version) + f" Got Domain '{domain}' version '{opset.version}'." 
) return False, error_message return True, "" diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py index d93ca8dba7fa0..3dcde7f6db398 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py @@ -463,7 +463,7 @@ def add_hidden_states_graph_output(self, model: ModelProto, optimized_onnx_path, assert self.clip_skip >= 0 and self.clip_skip < hidden_layers - node_output_name = "/text_model/encoder/layers.{}/Add_1_output_0".format(hidden_layers - 1 - self.clip_skip) + node_output_name = f"/text_model/encoder/layers.{hidden_layers - 1 - self.clip_skip}/Add_1_output_0" # search the name in outputs of all node found = False diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index 257ce977700a6..5e746ed0c62d4 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -1238,7 +1238,7 @@ TEST(MathOpTest, Sum_8_Test1) { // This test runs fine on CPU Plugin test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Expected output shape [{3,3,3}] did not match run output shape [{3,1,1}] for sum + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Expected output shape [{3,3,3}] did not match run output shape [{3,1,1}] for sum #endif } @@ -1264,7 +1264,7 @@ TEST(MathOpTest, Sum_8_Test1_double) { // This test runs fine on CPU Plugin test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Expected output shape [{3,3,3}] did not match run output shape [{3,1,1}] for sum + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Expected output shape [{3,3,3}] did not match run output shape [{3,1,1}] for sum #endif } TEST(MathOpTest, Sum_8_Test2) { diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index c9b851e450f9d..79da8004a9edd 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -1086,7 +1086,7 @@ TEST(ReductionOpTest, ReduceMax_int32) { #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: axis must be 0 #endif } @@ -1107,7 +1107,7 @@ TEST(ReductionOpTest, ReduceMax_int64) { #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: axis must be 0 #endif } diff --git a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc index 0e783a94c5479..b3e1025e7367c 100644 --- a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc +++ b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc @@ -556,10 +556,11 @@ TEST(NnapiExecutionProviderTest, ActivationOutsideOfPartition) { constexpr auto* model_file_name = ORT_TSTR("testdata/mnist.basic.ort"); // stop NNAPI partitioning at Relu so NNAPI EP only takes first Conv const auto nnapi_partitioning_stop_ops = "Relu"; - TestModelLoad(model_file_name, std::make_unique(0, nnapi_partitioning_stop_ops), - // expect one NNAPI partition - [](const Graph& graph) { ASSERT_EQ(CountAssignedNodes(graph, kNnapiExecutionProvider), 1) - << "Exactly one node should have been taken by the NNAPI EP"; }); + TestModelLoad( + model_file_name, std::make_unique(0, nnapi_partitioning_stop_ops), + // expect one NNAPI partition + [](const Graph& graph) { ASSERT_EQ(CountAssignedNodes(graph, kNnapiExecutionProvider), 1) + << "Exactly one node should have been taken by the NNAPI EP"; }); } } // namespace test diff --git a/orttraining/orttraining/python/training/ortmodule/_fallback.py b/orttraining/orttraining/python/training/ortmodule/_fallback.py index 44f96dcff7fb0..56bb45d064d8a 100644 --- a/orttraining/orttraining/python/training/ortmodule/_fallback.py +++ b/orttraining/orttraining/python/training/ortmodule/_fallback.py @@ -175,9 +175,9 @@ def fallback(self, log_level: _logger.LogLevel, *inputs, **kwargs): # This warning will not be raised again if retry is not enabled self._logger.warning( - "Fallback to PyTorch due to exception {} was triggered. " + f"Fallback to PyTorch due to exception {exception_type} was triggered. " "Report this issue with a minimal repro at https://www.github.com/microsoft/onnxruntime. " - "See details below:\n\n{}".format(exception_type, exception_string) + f"See details below:\n\n{exception_string}" ) self._raised_fallback_exception = True diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt index 42c016984bc0f..25454ce40c263 100644 --- a/requirements-lintrunner.txt +++ b/requirements-lintrunner.txt @@ -1,9 +1,9 @@ # This file is auto updated by dependabot -lintrunner-adapters>=0.8.0 +lintrunner-adapters>=0.11.0 # RUFF -ruff==0.0.292 +ruff==0.1.4 # BLACK-ISORT -black==23.7.0 +black==23.10.1 isort==5.12.0 # CLANGFORMAT -clang-format==16.0.6 +clang-format==17.0.4 diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index a38087aa0313a..e0559419ef8c7 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1637,9 +1637,7 @@ def run_adb_shell(cmd): # GCOV_PREFIX specifies the root directory # for creating the runtime code coverage files. 
if args.code_coverage: - adb_shell( - "cd {0} && GCOV_PREFIX={0} GCOV_PREFIX_STRIP={1} {2}".format(device_dir, cwd.count(os.sep) + 1, cmd) - ) + adb_shell(f"cd {device_dir} && GCOV_PREFIX={device_dir} GCOV_PREFIX_STRIP={cwd.count(os.sep) + 1} {cmd}") else: adb_shell(f"cd {device_dir} && {cmd}") @@ -1689,7 +1687,7 @@ def run_adb_shell(cmd): ) if args.use_nnapi: - run_adb_shell("{0}/onnx_test_runner -e nnapi {0}/test".format(device_dir)) + run_adb_shell(f"{device_dir}/onnx_test_runner -e nnapi {device_dir}/test") else: run_adb_shell(f"{device_dir}/onnx_test_runner {device_dir}/test") @@ -1702,9 +1700,9 @@ def run_adb_shell(cmd): adb_push("onnxruntime_customopregistration_test", device_dir, cwd=cwd) adb_shell(f"chmod +x {device_dir}/onnxruntime_shared_lib_test") adb_shell(f"chmod +x {device_dir}/onnxruntime_customopregistration_test") - run_adb_shell("LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{0} {0}/onnxruntime_shared_lib_test".format(device_dir)) + run_adb_shell(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{device_dir} {device_dir}/onnxruntime_shared_lib_test") run_adb_shell( - "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{0} {0}/onnxruntime_customopregistration_test".format(device_dir) + f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{device_dir} {device_dir}/onnxruntime_customopregistration_test" ) diff --git a/tools/python/find_optimizer_opset_version_updates_required.py b/tools/python/find_optimizer_opset_version_updates_required.py index 0076d27fe950e..8a5e57b51e38d 100644 --- a/tools/python/find_optimizer_opset_version_updates_required.py +++ b/tools/python/find_optimizer_opset_version_updates_required.py @@ -54,7 +54,7 @@ def get_call_args_from_file(filename: str, function_or_declaration: str) -> typi # TODO: handle automatically by merging lines log.error( "Call/Declaration is split over multiple lines. Please check manually." 
- "File:{} Line:{}".format(filename, line_num) + f"File:{filename} Line:{line_num}" ) continue diff --git a/winml/lib/Api.Image/inc/ImageConversionHelpers.h b/winml/lib/Api.Image/inc/ImageConversionHelpers.h index 8e3dca2ae11e8..5a9c8f21255b5 100644 --- a/winml/lib/Api.Image/inc/ImageConversionHelpers.h +++ b/winml/lib/Api.Image/inc/ImageConversionHelpers.h @@ -52,5 +52,6 @@ bool VideoFramesHaveSameDevice(const wm::IVideoFrame& video_frame_1, const wm::I wgdx::Direct3D11::IDirect3DDevice GetDeviceFromDirect3DSurface(const wgdx::Direct3D11::IDirect3DSurface& d3dSurface); constexpr std::array supportedWinMLFormats = { - DXGI_FORMAT_R8G8B8A8_UNORM, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_B8G8R8X8_UNORM}; + DXGI_FORMAT_R8G8B8A8_UNORM, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_B8G8R8X8_UNORM +}; } // namespace _winml::Imaging diff --git a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h index a3d01cdff42fb..138e755289df9 100644 --- a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h +++ b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h @@ -40,10 +40,7 @@ class TensorToVideoFrameConverter : public ImageConverter { private: GUID _d3d11TextureGUID = { - 0x14bf1054, - 0x6ce7, - 0x4c00, - {0xa1, 0x32, 0xb0, 0xf2, 0x11, 0x5D, 0xE0, 0x7f} + 0x14bf1054, 0x6ce7, 0x4c00, {0xa1, 0x32, 0xb0, 0xf2, 0x11, 0x5D, 0xE0, 0x7f} }; // {14BF1054-6CE7-4C00-A132-B0F2115DE07F} GUID _handleGUID = { 0x700148fc, 0xc0cb, 0x4a7e, {0xa7, 0xc0, 0xe7, 0x43, 0xc1, 0x9, 0x9d, 0x62} diff --git a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h index 0cd184ca70b21..ebfccabf7e814 100644 --- a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h +++ b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h @@ -50,10 +50,7 @@ class VideoFrameToTensorConverter : public ImageConverter { private: GUID d3d11_texture_GUID_ = { - 0x485e4bb3, - 0x3fe8, - 0x497b, - {0x85, 0x9e, 0xc7, 0x5, 0x18, 0xdb, 0x11, 0x2a} + 0x485e4bb3, 0x3fe8, 0x497b, {0x85, 0x9e, 0xc7, 0x5, 0x18, 0xdb, 0x11, 0x2a} }; // {485E4BB3-3FE8-497B-859E-C70518DB112A} GUID handle_GUID_ = { 0xce43264e, 0x41f7, 0x4882, {0x9e, 0x20, 0xfa, 0xa5, 0x1e, 0x37, 0x64, 0xfc} diff --git a/winml/lib/Api.Ort/OnnxruntimeModel.cpp b/winml/lib/Api.Ort/OnnxruntimeModel.cpp index fb8413a897e75..24eb44b73dd3c 100644 --- a/winml/lib/Api.Ort/OnnxruntimeModel.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeModel.cpp @@ -81,7 +81,8 @@ HRESULT ModelInfo::RuntimeClassInitialize(_In_ OnnxruntimeEngineFactory* engine_ winml_adapter_api->ModelGetInputCount, winml_adapter_api->ModelGetInputName, winml_adapter_api->ModelGetInputDescription, - winml_adapter_api->ModelGetInputTypeInfo}; + winml_adapter_api->ModelGetInputTypeInfo + }; // Create inputs std::vector inputs; @@ -93,7 +94,8 @@ HRESULT ModelInfo::RuntimeClassInitialize(_In_ OnnxruntimeEngineFactory* engine_ winml_adapter_api->ModelGetOutputCount, winml_adapter_api->ModelGetOutputName, winml_adapter_api->ModelGetOutputDescription, - winml_adapter_api->ModelGetOutputTypeInfo}; + winml_adapter_api->ModelGetOutputTypeInfo + }; std::vector outputs; RETURN_IF_FAILED(CreateFeatureDescriptors(engine_factory, &output_helpers, ort_model, outputs)); diff --git a/winml/lib/Api/impl/TensorBase.h b/winml/lib/Api/impl/TensorBase.h index c8332e11234b5..6c68abd3ce6c9 100644 --- a/winml/lib/Api/impl/TensorBase.h +++ b/winml/lib/Api/impl/TensorBase.h @@ -217,7 +217,8 @@ struct TensorBase : TBase { } D3D12_HEAP_PROPERTIES heapProperties = { - 
D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; D3D12_RESOURCE_DESC resourceDesc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, diff --git a/winml/test/adapter/AdapterDmlEpTest.cpp b/winml/test/adapter/AdapterDmlEpTest.cpp index 81437f9db2de3..b4220650abb9c 100644 --- a/winml/test/adapter/AdapterDmlEpTest.cpp +++ b/winml/test/adapter/AdapterDmlEpTest.cpp @@ -116,7 +116,8 @@ std::array tensor_values = {}; winrt::com_ptr CreateD3D12Resource(ID3D12Device& device) { constexpr uint64_t buffer_size = tensor_size * sizeof(float); constexpr D3D12_HEAP_PROPERTIES heap_properties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; constexpr D3D12_RESOURCE_DESC resource_desc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, @@ -365,6 +366,7 @@ const AdapterDmlEpTestApi& getapi() { DmlCopyTensor, CreateCustomRegistry, ValueGetDeviceId, - SessionGetInputRequiredDeviceId}; + SessionGetInputRequiredDeviceId + }; return api; } diff --git a/winml/test/adapter/AdapterSessionTest.cpp b/winml/test/adapter/AdapterSessionTest.cpp index 1b1a36004264c..8c9124b2ff4ae 100644 --- a/winml/test/adapter/AdapterSessionTest.cpp +++ b/winml/test/adapter/AdapterSessionTest.cpp @@ -368,7 +368,8 @@ const AdapterSessionTestAPI& getapi() { Profiling, CopyInputAcrossDevices, CopyInputAcrossDevices_DML, - GetNumberOfIntraOpThreads}; + GetNumberOfIntraOpThreads + }; if (SkipGpuTests()) { api.AppendExecutionProvider_DML = SkipTest; diff --git a/winml/test/api/LearningModelAPITest.cpp b/winml/test/api/LearningModelAPITest.cpp index ab45e2414854d..01ca2b8930506 100644 --- a/winml/test/api/LearningModelAPITest.cpp +++ b/winml/test/api/LearningModelAPITest.cpp @@ -247,9 +247,11 @@ static void CheckLearningModelPixelRange() { // Normalized_0_1 and image output L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx", // Normalized_1_1 and image output - L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx"}; + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx" + }; std::vector pixelRanges = { - LearningModelPixelRange::ZeroTo255, LearningModelPixelRange::ZeroToOne, LearningModelPixelRange::MinusOneToOne}; + LearningModelPixelRange::ZeroTo255, LearningModelPixelRange::ZeroToOne, LearningModelPixelRange::MinusOneToOne + }; for (uint32_t model_i = 0; model_i < modelPaths.size(); model_i++) { LearningModel learningModel = nullptr; WINML_EXPECT_NO_THROW(APITest::LoadModel(modelPaths[model_i], learningModel)); @@ -329,7 +331,8 @@ const LearningModelApiTestsApi& getapi() { CloseModelCheckEval, CloseModelNoNewSessions, CheckMetadataCaseInsensitive, - CreateCorruptModel}; + CreateCorruptModel + }; if (RuntimeParameterExists(L"noVideoFrameTests")) { api.CloseModelCheckEval = SkipTest; diff --git a/winml/test/api/LearningModelBindingAPITest.cpp b/winml/test/api/LearningModelBindingAPITest.cpp index b77421e191020..8279f4f89f0ed 100644 --- a/winml/test/api/LearningModelBindingAPITest.cpp +++ b/winml/test/api/LearningModelBindingAPITest.cpp @@ -669,7 +669,8 @@ const LearningModelBindingAPITestsApi& getapi() { VerifyOutputAfterEvaluateAsyncCalledTwice, VerifyOutputAfterImageBindCalledTwice, SequenceLengthTensorFloat, - SequenceConstructTensorString}; + SequenceConstructTensorString + }; if (SkipGpuTests()) { api.GpuSqueezeNet = SkipTest; diff --git 
a/winml/test/api/LearningModelSessionAPITest.cpp b/winml/test/api/LearningModelSessionAPITest.cpp index 21cdaa62bc898..4ec79b8a0f4c6 100644 --- a/winml/test/api/LearningModelSessionAPITest.cpp +++ b/winml/test/api/LearningModelSessionAPITest.cpp @@ -793,7 +793,8 @@ static void STFT( auto n_dfts = static_cast(1 + floor((signal_size - dft_size) / hop_size)); auto input_shape = std::vector{1, INT64(signal_size)}; auto output_shape = std::vector{ - INT64(batch_size), INT64(n_dfts), is_onesided ? ((INT64(dft_size) >> 1) + 1) : INT64(dft_size), 2}; + INT64(batch_size), INT64(n_dfts), is_onesided ? ((INT64(dft_size) >> 1) + 1) : INT64(dft_size), 2 + }; auto dft_length = TensorInt64Bit::CreateFromArray({}, {INT64(dft_size)}); auto model = @@ -1372,7 +1373,8 @@ static void ModelBuilding_GridSample_Internal(LearningModelDeviceKind kind) { 5.0000f, 5.0000f, 10.0000f, - 10.0000f}; + 10.0000f + }; input_dims = {1, 1, 3, 2}; grid_dims = {1, 2, 4, 2}; @@ -2312,7 +2314,8 @@ const LearningModelSessionAPITestsApi& getapi() { ModelBuilding_STFT, ModelBuilding_MelSpectrogramOnThreeToneSignal, ModelBuilding_MelWeightMatrix, - SetName}; + SetName + }; if (SkipGpuTests()) { api.CreateSessionDeviceDirectX = SkipTest; diff --git a/winml/test/api/RawApiHelpers.cpp b/winml/test/api/RawApiHelpers.cpp index b6f39f8e88224..e84af6d239799 100644 --- a/winml/test/api/RawApiHelpers.cpp +++ b/winml/test/api/RawApiHelpers.cpp @@ -38,7 +38,8 @@ void RunOnDevice(ml::learning_model& model, ml::learning_model_device& device, I auto channel_buffers_pointers = std::vector{ &input_data.at(0), &input_data.at(0) + channel_buffers_sizes[0], - &input_data.at(0) + channel_buffers_sizes[0] + +channel_buffers_sizes[1]}; + &input_data.at(0) + channel_buffers_sizes[0] + +channel_buffers_sizes[1] + }; WINML_EXPECT_HRESULT_SUCCEEDED(binding->bind_as_references( input_name, diff --git a/winml/test/api/RawApiTestsGpu.cpp b/winml/test/api/RawApiTestsGpu.cpp index 9c1c06a01603f..f12ba0f36cebf 100644 --- a/winml/test/api/RawApiTestsGpu.cpp +++ b/winml/test/api/RawApiTestsGpu.cpp @@ -165,7 +165,8 @@ const RawApiTestsGpuApi& getapi() { CreateDirectXMinPowerDevice, Evaluate, EvaluateNoInputCopy, - EvaluateManyBuffers}; + EvaluateManyBuffers + }; if (SkipGpuTests()) { api.CreateDirectXDevice = SkipTest; diff --git a/winml/test/concurrency/ConcurrencyTests.cpp b/winml/test/concurrency/ConcurrencyTests.cpp index 46528ef70d377..404afbf67ea1c 100644 --- a/winml/test/concurrency/ConcurrencyTests.cpp +++ b/winml/test/concurrency/ConcurrencyTests.cpp @@ -141,7 +141,8 @@ void EvalAsyncDifferentBindings() { std::vector evaluation_units(num_units, EvaluationUnit()); std::vector ifvs = { - FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png")}; + FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png") + }; // same session, different binding auto model = LearningModel::LoadFromFilePath(FileHelpers::GetModulePath() + L"model.onnx"); @@ -191,7 +192,8 @@ void MultiThreadMultiSessionOnDevice(const LearningModelDevice& device) { auto path = FileHelpers::GetModulePath() + L"model.onnx"; auto model = LearningModel::LoadFromFilePath(path); std::vector ivfs = { - FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png")}; + FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png") + }; std::vector max_indices = { 281, // tabby, tabby cat 0 // tench, Tinca tinca @@ -257,7 +259,8 @@ void 
MultiThreadSingleSessionOnDevice(const LearningModelDevice& device) { LearningModelSession model_session = nullptr; WINML_EXPECT_NO_THROW(model_session = LearningModelSession(model, device)); std::vector ivfs = { - FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png")}; + FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), FileHelpers::LoadImageFeatureValue(L"fish.png") + }; std::vector max_indices = { 281, // tabby, tabby cat 0 // tench, Tinca tinca @@ -322,7 +325,8 @@ const ConcurrencyTestsApi& getapi() { MultiThreadSingleSessionGpu, EvalAsyncDifferentModels, EvalAsyncDifferentSessions, - EvalAsyncDifferentBindings}; + EvalAsyncDifferentBindings + }; if (SkipGpuTests()) { api.MultiThreadMultiSessionGpu = SkipTest; diff --git a/winml/test/image/imageTestHelper.cpp b/winml/test/image/imageTestHelper.cpp index b7c1eb42965f8..91eed2a807782 100644 --- a/winml/test/image/imageTestHelper.cpp +++ b/winml/test/image/imageTestHelper.cpp @@ -148,7 +148,8 @@ TensorFloat LoadInputImageFromGPU(SoftwareBitmap softwareBitmap, const std::wstr // 3 is number of channels we use. R G B without alpha. UINT64 bufferbytesize = 3 * sizeof(float) * softwareBitmap.PixelWidth() * softwareBitmap.PixelHeight(); D3D12_HEAP_PROPERTIES heapProperties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; D3D12_RESOURCE_DESC resourceDesc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, diff --git a/winml/test/image/imagetests.cpp b/winml/test/image/imagetests.cpp index 6157520ca96a3..2251954c59e4c 100644 --- a/winml/test/image/imagetests.cpp +++ b/winml/test/image/imagetests.cpp @@ -939,7 +939,8 @@ TEST_F(ImageTests, ImageBindingAsGPUTensor) { UINT64 buffer_byte_size = static_cast(software_bitmap.PixelWidth()) * software_bitmap.PixelHeight() * 3 * sizeof(float); D3D12_HEAP_PROPERTIES heap_properties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; D3D12_RESOURCE_DESC resource_desc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, diff --git a/winml/test/model/model_tests.cpp b/winml/test/model/model_tests.cpp index cb5cbbecb5ef0..f40f08ad2696d 100644 --- a/winml/test/model/model_tests.cpp +++ b/winml/test/model/model_tests.cpp @@ -232,7 +232,8 @@ static std::vector GetAllTestCases() { ORT_TSTR("tf_resnet_v2_152"), ORT_TSTR("vgg19"), ORT_TSTR("yolov3"), - ORT_TSTR("zfnet512")}; + ORT_TSTR("zfnet512") + }; allDisabledTests.insert(std::begin(x86DisabledTests), std::end(x86DisabledTests)); #endif // Bad onnx test output caused by previously wrong SAME_UPPER/SAME_LOWER for ConvTranspose diff --git a/winml/test/model/skip_model_tests.h b/winml/test/model/skip_model_tests.h index 174f57143ee81..cf55d8bcbae7e 100644 --- a/winml/test/model/skip_model_tests.h +++ b/winml/test/model/skip_model_tests.h @@ -161,10 +161,8 @@ std::unordered_map> disabledGpu test name -> absolute difference sampleTolerance */ std::unordered_map sampleTolerancePerTests({ - {"fp16_inception_v1_opset7_GPU",0.005 }, - {"fp16_inception_v1_opset8_GPU", 0.005}, - { "candy_opset9_GPU", - 0.00150000 }, // Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/ - { "fp16_tiny_yolov2_opset8_GPU", - 0.109000 }, // Intel(R) UHD Graphics 630 
(29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/ + {"fp16_inception_v1_opset7_GPU", 0.005}, + {"fp16_inception_v1_opset8_GPU", 0.005}, + { "candy_opset9_GPU", 0.00150000}, // Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/ + { "fp16_tiny_yolov2_opset8_GPU", 0.109000}, // Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/ }); diff --git a/winml/test/scenario/cppwinrt/CustomNullOp.h b/winml/test/scenario/cppwinrt/CustomNullOp.h index 33709c5f72d3c..b50909548a6bf 100644 --- a/winml/test/scenario/cppwinrt/CustomNullOp.h +++ b/winml/test/scenario/cppwinrt/CustomNullOp.h @@ -69,7 +69,8 @@ struct NullOperatorFactory : winrt::implements allowedEdges{ CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Double), CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float), - CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16)}; + CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16) + }; typeConstraint.allowedTypes = allowedEdges.data(); typeConstraint.allowedTypeCount = static_cast(allowedEdges.size()); diff --git a/winml/test/scenario/cppwinrt/CustomOps.cpp b/winml/test/scenario/cppwinrt/CustomOps.cpp index 075bf5ed877a3..58d0fe6e64efc 100644 --- a/winml/test/scenario/cppwinrt/CustomOps.cpp +++ b/winml/test/scenario/cppwinrt/CustomOps.cpp @@ -305,7 +305,8 @@ static void CustomKernelWithBuiltInSchema() { // Register the kernel MLOperatorEdgeDescription floatTensorType = { - MLOperatorEdgeType::Tensor, static_cast(MLOperatorTensorDataType::Float)}; + MLOperatorEdgeType::Tensor, static_cast(MLOperatorTensorDataType::Float) + }; MLOperatorEdgeTypeConstrant constraint = {"T", &floatTensorType, 1}; @@ -318,7 +319,8 @@ static void CustomKernelWithBuiltInSchema() { 1, nullptr, 0, - MLOperatorKernelOptions::AllowDynamicInputShapes}; + MLOperatorKernelOptions::AllowDynamicInputShapes + }; Microsoft::WRL::ComPtr factory = wil::MakeOrThrow(CreateABIFooKernel); @@ -614,7 +616,8 @@ static void CustomKernelWithCustomSchema() { MLOperatorEdgeTypeConstrant kernelConstraint = {"T1", &floatTensorEdgeDesc, 1}; MLOperatorKernelDescription kernelDesc = { - "", "Foo", 7, MLOperatorExecutionType::Cpu, &kernelConstraint, testCases[caseIndex].useTypeLabel ? 1u : 0u}; + "", "Foo", 7, MLOperatorExecutionType::Cpu, &kernelConstraint, testCases[caseIndex].useTypeLabel ? 
1u : 0u + }; if (!testCases[caseIndex].attributeDefaultsInSchema) { kernelDesc.defaultAttributes = defaultAttributes; @@ -693,10 +696,8 @@ static void CustomKernelWithCustomSchema() { const CustomOpsTestsApi& getapi() { static CustomOpsTestsApi api = { - CustomOpsScenarioTestsClassSetup, - CustomOperatorFusion, - CustomKernelWithBuiltInSchema, - CustomKernelWithCustomSchema}; + CustomOpsScenarioTestsClassSetup, CustomOperatorFusion, CustomKernelWithBuiltInSchema, CustomKernelWithCustomSchema + }; if (SkipGpuTests()) { api.CustomOperatorFusion = SkipTest; diff --git a/winml/test/scenario/cppwinrt/NoisyReluCpu.h b/winml/test/scenario/cppwinrt/NoisyReluCpu.h index 5f89b20beebb9..5cccbae67407c 100644 --- a/winml/test/scenario/cppwinrt/NoisyReluCpu.h +++ b/winml/test/scenario/cppwinrt/NoisyReluCpu.h @@ -157,7 +157,8 @@ struct NoisyReluOperatorFactory : winrt::implements allowedEdges{ CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Double), CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float), - CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16)}; + CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16) + }; typeConstraint.allowedTypes = allowedEdges.data(); typeConstraint.allowedTypeCount = static_cast(allowedEdges.size()); @@ -194,7 +195,8 @@ struct NoisyReluOperatorFactory : winrt::implements attributeDefaultValues{ - noisyReluMeanAttributeValue, noisyReluVarianceAttributeValue}; + noisyReluMeanAttributeValue, noisyReluVarianceAttributeValue + }; noisyReluSchema.defaultAttributes = attributeDefaultValues.data(); noisyReluSchema.defaultAttributeCount = static_cast(attributeDefaultValues.size()); @@ -216,7 +218,8 @@ struct NoisyReluOperatorFactory : winrt::implements allowedEdges{ CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Double), CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float), - CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16)}; + CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16) + }; typeConstraint.allowedTypes = allowedEdges.data(); typeConstraint.allowedTypeCount = static_cast(allowedEdges.size()); @@ -239,7 +242,8 @@ struct NoisyReluOperatorFactory : winrt::implements attributeDefaultValues{ - noisyReluMeanAttributeValue, noisyReluVarianceAttributeValue}; + noisyReluMeanAttributeValue, noisyReluVarianceAttributeValue + }; kernelDescription.defaultAttributes = attributeDefaultValues.data(); kernelDescription.defaultAttributeCount = static_cast(attributeDefaultValues.size()); kernelDescription.options = MLOperatorKernelOptions::None; diff --git a/winml/test/scenario/cppwinrt/ReluCpu.h b/winml/test/scenario/cppwinrt/ReluCpu.h index c72285a4de7fb..7bb275f7b399b 100644 --- a/winml/test/scenario/cppwinrt/ReluCpu.h +++ b/winml/test/scenario/cppwinrt/ReluCpu.h @@ -114,7 +114,8 @@ struct ReluOperatorFactory : winrt::implements allowedEdges{ CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Double), CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float), - CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16)}; + CreateEdgeDescriptor(MLOperatorEdgeType::Tensor, MLOperatorTensorDataType::Float16) + }; typeConstraint.allowedTypes = allowedEdges.data(); typeConstraint.allowedTypeCount = static_cast(allowedEdges.size()); diff --git 
a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp index 9b389d014c953..9a03172340bf7 100644 --- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp +++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp @@ -510,7 +510,8 @@ static void Scenario9LoadBindEvalInputTensorGPU() { UINT64 bufferbytesize = 720 * 720 * 3 * sizeof(float); D3D12_HEAP_PROPERTIES heapProperties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; D3D12_RESOURCE_DESC resourceDesc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, @@ -983,7 +984,8 @@ static void Scenario22ImageBindingAsGPUTensor() { // 3 is number of channels we use. R G B without alpha. UINT64 bufferbytesize = 3 * sizeof(float) * softwareBitmap.PixelWidth() * softwareBitmap.PixelHeight(); D3D12_HEAP_PROPERTIES heapProperties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0 + }; D3D12_RESOURCE_DESC resourceDesc = { D3D12_RESOURCE_DIMENSION_BUFFER, 0, @@ -1085,7 +1087,8 @@ static void Scenario23NominalPixelRange() { std::vector modelPaths = {// Normalized_0_1 and image output modulePath + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx", // Normalized_1_1 and image output - modulePath + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx"}; + modulePath + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx" + }; for (uint32_t model_i = 0; model_i < modelPaths.size(); model_i++) { // load model and create session From 885bf3561da143ad90d30b1d5959a9b065f5a6c9 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Thu, 9 Nov 2023 10:12:57 +1000 Subject: [PATCH 10/13] Add tool to fix lines > 120 chars. (#18293) ### Description Helper to run clang-format on lines that are > 120 chars. We disable clang-format enforcing 120 chars by default because its formatting can negatively impact readability. If a developer has not manually kept a line within the 120 char limit, this tool will fix it. It will leave all other lines alone to honor the formatting the developer chose. ### Motivation and Context Help developers fix lint errors. The preferred approach is to use a vertical ruler/guideline in your editor when actually writing the code. --- tools/python/fix_long_lines.py | 134 +++++++++++++++++++++++++++++++++ tools/python/util/logger.py | 4 +- 2 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 tools/python/fix_long_lines.py diff --git a/tools/python/fix_long_lines.py b/tools/python/fix_long_lines.py new file mode 100644 index 0000000000000..383fdc9623551 --- /dev/null +++ b/tools/python/fix_long_lines.py @@ -0,0 +1,134 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License.
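For illustration, a hypothetical way to drive the helper added below (assuming the repository root as the working directory and a `clang-format` binary on the PATH; the flags mirror the argparse options defined later in this file):

```python
# Hypothetical invocation sketch for tools/python/fix_long_lines.py (illustration only).
import subprocess
import sys

# Reformat only the over-long lines in .cc/.h files that differ from origin/main,
# which is the intended pre-PR usage described above.
subprocess.run(
    [sys.executable, "tools/python/fix_long_lines.py", "--branch", "origin/main"],
    check=True,
)

# Or process everything under include/onnxruntime and onnxruntime/core instead.
subprocess.run(
    [sys.executable, "tools/python/fix_long_lines.py", "--all_files"],
    check=True,
)
```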
+ +import argparse +import logging +import os +import pathlib +import shutil +import tempfile + +from util import logger, run + +_log = logger.get_logger("fix_long_lines", logging.INFO) + + +# look for long lines in the file, and if found run clang-format on those lines +def _process_files(filenames, clang_exe, tmpdir): + for path in filenames: + _log.debug(f"Checking {path}") + bad_lines = [] + + with open(path, encoding="UTF8") as f: + line_num = 0 + for line in f: + line_num += 1 # clang-format line numbers start at 1 + if len(line) > 120: + bad_lines.append(line_num) + + if bad_lines: + _log.info(f"Updating {path}") + filename = os.path.basename(path) + target = os.path.join(tmpdir, filename) + shutil.copy(path, target) + + # run clang-format to update just the long lines in the file + cmd = [ + clang_exe, + "-i", + ] + for line in bad_lines: + cmd.append(f"--lines={line}:{line}") + + cmd.append(target) + + run(*cmd, cwd=tmpdir, check=True, shell=True) + + # copy updated file back to original location + shutil.copy(target, path) + + +# file extensions we process +_EXTENSIONS = [".cc", ".h"] + + +def _get_branch_diffs(ort_root, branch): + command = ["git", "diff", branch, "--name-only"] + result = run(*command, capture_stdout=True, check=True) + + # stdout is bytes. one filename per line. decode, split, and filter to the extensions we are looking at + for f in result.stdout.decode("utf-8").splitlines(): + if os.path.splitext(f.lower())[-1] in _EXTENSIONS: + yield os.path.join(ort_root, f) + + +def _get_file_list(path): + for root, _, files in os.walk(path): + for file in files: + if os.path.splitext(file.lower())[-1] in _EXTENSIONS: + yield os.path.join(root, file) + + +def main(): + argparser = argparse.ArgumentParser( + "Script to fix long lines in the source using clang-format. " + "Only lines that exceed the 120 character maximum are altered in order to minimize the impact. " + "Checks .cc and .h files", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + argparser.add_argument( + "--branch", + type=str, + default="origin/main", + help="Limit changes to files that differ from this branch. Use origin/main when preparing a PR.", + ) + + argparser.add_argument( + "--all_files", + action="store_true", + help="Process all files under /include/onnxruntime and /onnxruntime/core. 
Ignores --branch value.", + ) + + argparser.add_argument( + "--clang-format", + type=pathlib.Path, + required=False, + default="clang-format", + help="Path to clang-format executable", + ) + + argparser.add_argument("--debug", action="store_true", help="Set log level to DEBUG.") + + args = argparser.parse_args() + + if args.debug: + _log.setLevel(logging.DEBUG) + + script_dir = os.path.dirname(os.path.realpath(__file__)) + ort_root = os.path.abspath(os.path.join(script_dir, "..", "..")) + + with tempfile.TemporaryDirectory() as tmpdir: + # create config in tmpdir + with open(os.path.join(tmpdir, ".clang-format"), "w") as f: + f.write( + """ + BasedOnStyle: Google + ColumnLimit: 120 + DerivePointerAlignment: false + """ + ) + + clang_format = str(args.clang_format) + + if args.all_files: + include_path = os.path.join(ort_root, "include", "onnxruntime") + src_path = os.path.join(ort_root, "onnxruntime", "core") + _process_files(_get_file_list(include_path), clang_format, tmpdir) + _process_files(_get_file_list(src_path), clang_format, tmpdir) + else: + _process_files(_get_branch_diffs(ort_root, args.branch), clang_format, tmpdir) + + +if __name__ == "__main__": + main() diff --git a/tools/python/util/logger.py b/tools/python/util/logger.py index 15e04528ac7ac..d6f3026959daa 100644 --- a/tools/python/util/logger.py +++ b/tools/python/util/logger.py @@ -4,8 +4,8 @@ import logging -def get_logger(name): +def get_logger(name, level=logging.DEBUG): logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s") logger = logging.getLogger(name) - logger.setLevel(logging.DEBUG) + logger.setLevel(level) return logger From 4dc63692f83ad1217e0964f6efbf78f4c83dfb60 Mon Sep 17 00:00:00 2001 From: guyang3532 <62738430+guyang3532@users.noreply.github.com> Date: Thu, 9 Nov 2023 09:52:48 +0800 Subject: [PATCH 11/13] Add FlattenAndUnpad Op (#17845) ### Description Add an op named `FlattenAndUnpad`. This op implements two functions: 1. Flatten the first two dims of the input tensor. 2. Gather the valid values from the input tensor with the index tensor. ### Motivation and Context The gradient op of `PadAndUnflatten` was `GatherGrad`, which performs poorly. I implemented `FlattenAndUnpad` to replace `GatherGrad` as the gradient of `PadAndUnflatten`. With this op, we can also simplify the "Reshape + ShrunkenGather" pattern to `FlattenAndUnpad` in the padding elimination optimizer, which will also improve performance.
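For reference, a minimal NumPy sketch of the FlattenAndUnpad / PadAndUnflatten contract described above (illustration only, not the CUDA kernels added in this patch; the sample values are taken from the new unit tests):

```python
# Minimal NumPy sketch of the FlattenAndUnpad / PadAndUnflatten semantics (illustration only).
import numpy as np


def flatten_and_unpad(x, indices):
    # Flatten the first two dims [M1, M2, d2, ..., dN] -> [M1*M2, d2, ..., dN],
    # then keep only the rows selected by `indices`.
    m1, m2 = x.shape[0], x.shape[1]
    flat = x.reshape((m1 * m2,) + x.shape[2:])
    return flat[indices], np.array([m1, m2], dtype=np.int64)  # output, unflatten_dims


def pad_and_unflatten(x, indices, unflatten_dims):
    # Inverse direction: scatter the rows of x back into a zero-padded
    # [M1, M2, d2, ..., dN] tensor at the flattened positions given by `indices`.
    m1, m2 = int(unflatten_dims[0]), int(unflatten_dims[1])
    out = np.zeros((m1 * m2,) + x.shape[1:], dtype=x.dtype)
    out[indices] = x
    return out.reshape((m1, m2) + x.shape[1:])


# Values taken from FlattenAndUnpadTest.Int32Type3D in this patch.
x = np.array([[[0, 0, 0], [1, 2, 3], [0, 0, 0]],
              [[4, 5, 6], [7, 8, 9], [0, 0, 0]]], dtype=np.int32)
idx = np.array([1, 3, 4], dtype=np.int64)
unpadded, dims = flatten_and_unpad(x, idx)
print(unpadded)  # [[1 2 3] [4 5 6] [7 8 9]]
print(dims)      # [2 3]
print(pad_and_unflatten(unpadded, idx, dims))  # recovers x, with zeros in the padded rows
```

In the training graph the two ops are registered as each other's gradients (see the gradient_builder.cc change below), which is what allows FlattenAndUnpad to replace the GatherGrad-based path.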
--- .../core/graph/gradient_builder.cc | 17 +- .../orttraining/core/graph/gradient_builder.h | 1 + .../core/graph/gradient_builder_registry.cc | 1 + .../core/graph/training_op_defs.cc | 26 ++- .../compute_optimizer/padding_elimination.cc | 90 +++------- .../test/gradient/gradient_ops_test.cc | 3 +- .../python/orttraining_test_ortmodule_api.py | 12 +- .../cuda/flatten_and_unpad_test.cc | 157 ++++++++++++++++++ .../cuda/pad_and_unflatten_test.cc | 12 -- .../cuda/cuda_training_kernels.cc | 2 + .../cuda/tensor/flatten_and_unpad.cc | 91 ++++++++++ .../cuda/tensor/flatten_and_unpad.h | 21 +++ .../cuda/tensor/flatten_and_unpad_impl.cu | 83 +++++++++ .../cuda/tensor/flatten_and_unpad_impl.h | 25 +++ .../cuda/tensor/pad_and_unflatten.cc | 11 +- .../cuda/tensor/pad_and_unflatten_impl.cu | 12 +- .../rocm/rocm_training_kernels.cc | 2 + 17 files changed, 448 insertions(+), 118 deletions(-) create mode 100644 orttraining/orttraining/test/training_ops/cuda/flatten_and_unpad_test.cc create mode 100644 orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.cc create mode 100644 orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.h create mode 100644 orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.cu create mode 100644 orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index 7100cedaf78a0..755a8e49d9d12 100755 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -791,13 +791,16 @@ IMPLEMENT_GRADIENT_BUILDER(GetGatherGradient) { IMPLEMENT_GRADIENT_BUILDER(GetPadAndUnflattenGradient) { return std::vector{ - NodeDef(OpDef("Reshape"), - {GO(0), O(1)}, - {IA("GO_reshaped")}), - NodeDef(OpDef{"Gather", kOnnxDomain, 1}, - {IA("GO_reshaped"), I(1)}, - {GI(0)}, - SrcNodeAttributes())}; + NodeDef(OpDef{"FlattenAndUnpad", kMSDomain, 1}, + {GO(0), I(1)}, + {GI(0), IA("Unflatten_dims")})}; +} + +IMPLEMENT_GRADIENT_BUILDER(GetFlattenAndUnpadGradient) { + return std::vector{ + NodeDef(OpDef{"PadAndUnflatten", kMSDomain, 1}, + {GO(0), I(1), O(1)}, + {GI(0)})}; } IMPLEMENT_GRADIENT_BUILDER(GetShrunkenGatherGradient) { diff --git a/orttraining/orttraining/core/graph/gradient_builder.h b/orttraining/orttraining/core/graph/gradient_builder.h index 08987a86ebda9..92bfae9cd83a4 100755 --- a/orttraining/orttraining/core/graph/gradient_builder.h +++ b/orttraining/orttraining/core/graph/gradient_builder.h @@ -40,6 +40,7 @@ DECLARE_GRADIENT_BUILDER(GetAveragePoolGradient) DECLARE_GRADIENT_BUILDER(GetMaxPoolGradient) DECLARE_GRADIENT_BUILDER(GetGatherGradient) DECLARE_GRADIENT_BUILDER(GetPadAndUnflattenGradient) +DECLARE_GRADIENT_BUILDER(GetFlattenAndUnpadGradient) DECLARE_GRADIENT_BUILDER(GetShrunkenGatherGradient) DECLARE_GRADIENT_BUILDER(GetConvGradient) DECLARE_GRADIENT_BUILDER(GetUnsqueezeGradient) diff --git a/orttraining/orttraining/core/graph/gradient_builder_registry.cc b/orttraining/orttraining/core/graph/gradient_builder_registry.cc index f280a02cb490f..ea56be9e6dfa3 100755 --- a/orttraining/orttraining/core/graph/gradient_builder_registry.cc +++ b/orttraining/orttraining/core/graph/gradient_builder_registry.cc @@ -72,6 +72,7 @@ void GradientBuilderRegistry::RegisterGradientBuilders() { REGISTER_GRADIENT_BUILDER("MaxPool", GetMaxPoolGradient); REGISTER_GRADIENT_BUILDER("Gather", GetGatherGradient); REGISTER_GRADIENT_BUILDER("PadAndUnflatten", 
GetPadAndUnflattenGradient); + REGISTER_GRADIENT_BUILDER("FlattenAndUnpad", GetFlattenAndUnpadGradient); REGISTER_GRADIENT_BUILDER("ShrunkenGather", GetShrunkenGatherGradient); REGISTER_GRADIENT_BUILDER("Conv", GetConvGradient); REGISTER_GRADIENT_BUILDER("Squeeze", GetSqueezeGradient); diff --git a/orttraining/orttraining/core/graph/training_op_defs.cc b/orttraining/orttraining/core/graph/training_op_defs.cc index 283883c2e33c6..8d3f76be20c65 100644 --- a/orttraining/orttraining/core/graph/training_op_defs.cc +++ b/orttraining/orttraining/core/graph/training_op_defs.cc @@ -4741,7 +4741,7 @@ Return true if all elements are true and false otherwise. "For other indices, the corresponding value in output will be padded to zero." "The indices don't allow duplicated index values, otherwise, though there is no runtime check" - "(in case of performance concern), the behaviour of output is undefined." + "(in case of performance concern), the behavior of output is undefined." "An example:" " input: [[1, 2, 3, 4], [5, 6, 7, 8]], shape is [2, 4]" @@ -4749,14 +4749,12 @@ Return true if all elements are true and false otherwise. " unflatten_dims: [2, 3], shape is [2]" " output: [[[1, 2, 3, 4], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [5, 6, 7, 8]]]," - " shape is [2, 3, 4]" - " flatten_output_shape: [6, 4], shape is [2]") + " shape is [2, 3, 4]") .Input(0, "input", "input data of rank N, shape is [d1, d2, ..., dN]", "T") .Input(1, "indices", "1D Tensor of int32/int64 indices, shape is [d1], each element's value ranges in [0, M1*M2).", "T_INDEX") .Input(2, "unflatten_dims", "1D tensor with two values, [M1, M2].", "T_INT") .Output(0, "output", "output data of rank N+1, [M1, M2, d2, ..., dN]", "T") - .Output(1, "flatten_output_shape", "1D tensor with output shape, [M1*M2, d2, ..., dN]", "T_INT") .TypeConstraint( "T_INT", {"tensor(int32)", "tensor(int64)"}, @@ -4770,6 +4768,26 @@ Return true if all elements are true and false otherwise. {"tensor(int32)", "tensor(int64)"}, "Constrain indices to integer types"); + ONNX_CONTRIB_OPERATOR_SCHEMA(FlattenAndUnpad) + .SetDomain(kMSDomain) + .SinceVersion(1) + .SetDoc( + "FlattenAndUnpad operator flattens the first two dims of input tensor, and unpad according to given indices." 
+ "This is used by padding elimination graph transformer.") + .Input(0, "input", "input data of rank N + 1, shape is [M1, M2, d2, ..., dN]", "T") + .Input(1, "indices", "1D Tensor of int32/int64 indices, shape is [d1], each element's value ranges in [0, M1*M2).", + "T_INT") + .Output(0, "output", "output data of rank N, [d1, d2, ..., dN]", "T") + .Output(1, "unflatten_dims", "1D tensor with two values, [M1, M2].", "T_INT") + .TypeConstraint( + "T_INT", + {"tensor(int32)", "tensor(int64)"}, + "Constrain indices and shape to integer tensors.") + .TypeConstraint( + "T", + {"tensor(int32)", "tensor(int64)", "tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, + "Constrain input and output types to float tensors."); + ONNX_CONTRIB_OPERATOR_SCHEMA(GRUTraining) .SetDomain(kMSDomain) .SinceVersion(1) diff --git a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc index 74247c059cf84..73638e8ba62a0 100644 --- a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc +++ b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc @@ -129,91 +129,43 @@ NodeArg* InsertExpandForNodeInput(Graph& graph, return new_expand_node->MutableOutputDefs()[0]; } -// Insert Reshape + ShrunkenGather to flatten the in_index-th input of node. +// Insert FlattenAndUnpad to flatten and unpad the in_index-th input of node. // The gather_index_arg is the indices of the elements that are not padding. NodeArg* InsertFlattenPatternForInput(Graph& graph, Node& node, uint32_t in_index, NodeArg* gather_index_arg, const logging::Logger& logger) { - InlinedVector reshape_input_args; - reshape_input_args.reserve(2); - reshape_input_args.push_back(node.MutableInputDefs()[in_index]); - std::vector new_shape; - new_shape.push_back(-1); // only support flatten 0 and 1 dims - auto input_shape = node.InputDefs()[in_index]->Shape(); - ORT_ENFORCE(input_shape->dim_size() >= 2); - ONNX_NAMESPACE::TensorShapeProto flattened_shape; - if (input_shape->dim(0).has_dim_value() && input_shape->dim(1).has_dim_value()) { - flattened_shape.add_dim()->set_dim_value(input_shape->dim(0).dim_value() * input_shape->dim(1).dim_value()); - } else { - std::string token_dim_name = MakeString("total_token_count_", utils::GetRandomSeed()); - flattened_shape.add_dim()->set_dim_param(token_dim_name); - } - for (int k = 2; k < input_shape->dim_size(); k++) { - ORT_ENFORCE(input_shape->dim(k).has_dim_value()); - new_shape.push_back(input_shape->dim(k).dim_value()); - flattened_shape.add_dim()->set_dim_value(input_shape->dim(k).dim_value()); - } - ONNX_NAMESPACE::TensorProto new_shape_const_tensor; - new_shape_const_tensor.set_name(graph.GenerateNodeArgName("new_shape")); - new_shape_const_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); - new_shape_const_tensor.add_dims(new_shape.size()); - new_shape_const_tensor.set_raw_data(new_shape.data(), new_shape.size() * sizeof(int64_t)); - NodeArg* new_shape_arg = &graph_utils::AddInitializer(graph, new_shape_const_tensor); - reshape_input_args.push_back(new_shape_arg); - - InlinedVector reshape_output_args; - reshape_output_args.push_back( - &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("inputs_reshape_result"), - node.MutableInputDefs()[in_index]->TypeAsProto())); - - Node* new_reshape_node = InsertIntermediateNodeOnDestInput( - graph, node, - in_index, - 0, - 0, - graph.GenerateNodeName("Reshape"), - "Reshape", - 
"Reshape node to filter invalid tokens.", - reshape_input_args, - reshape_output_args, - {}, - "", - logger); + InlinedVector unpad_input_args; + unpad_input_args.reserve(2); + unpad_input_args.push_back(node.MutableInputDefs()[in_index]); + unpad_input_args.push_back(gather_index_arg); - new_reshape_node->SetExecutionProviderType(node.GetExecutionProviderType()); - auto reshape_out_arg = new_reshape_node->MutableOutputDefs()[0]; - - reshape_out_arg->SetShape(flattened_shape); - - InlinedVector gather_input_args; - gather_input_args.reserve(2); - gather_input_args.push_back(reshape_output_args[0]); - gather_input_args.push_back(gather_index_arg); - - InlinedVector gather_output_args; - gather_output_args.push_back( + InlinedVector unpad_output_args; + unpad_output_args.push_back( &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("padding_filter_result"), - reshape_out_arg->TypeAsProto())); + nullptr)); + unpad_output_args.push_back( + &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("d1_d2_shape"), + nullptr)); - Node* new_gather_node = InsertIntermediateNodeOnDestInput( + Node* unpad_node = InsertIntermediateNodeOnDestInput( graph, node, in_index, 0, 0, graph.GenerateNodeName("PaddingFilter"), - "ShrunkenGather", - "ShrunkenGather node to filter invalid tokens.", - gather_input_args, - gather_output_args, + "FlattenAndUnpad", + "FlattenAndUnpad node to filter invalid tokens.", + unpad_input_args, + unpad_output_args, {}, kMSDomain, logger); - new_gather_node->SetExecutionProviderType(node.GetExecutionProviderType()); - auto gather_out_arg = new_gather_node->MutableOutputDefs()[0]; - return gather_out_arg; + unpad_node->SetExecutionProviderType(node.GetExecutionProviderType()); + auto unpad_out_arg = unpad_node->MutableOutputDefs()[0]; + return unpad_out_arg; } // Insert PadAndUnflatten to unflatten the shape of the in_index-th input of node. 
@@ -236,10 +188,6 @@ NodeArg* InsertNodesForOutput(Graph& graph, pad_node_output_args.push_back( &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("padded_result"), nullptr)); - pad_node_output_args.push_back( - &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("padded_d1xd2_shape"), - nullptr)); - Node* new_gathergrad_node = InsertIntermediateNodeOnDestInput( graph, node, in_index, diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index 890a1bbccbc92..6fb42dd59b6a0 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -3011,7 +3011,6 @@ TEST(GradientCheckerTest, PadAndUnflattenGrad) { std::vector> x_datas = {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, {3, 5, 0, 1}, {5, 2}}; TensorInfo padded_out_info({5, 2, 3}, true); - TensorInfo out_shape_info({2}, false, nullptr, DataTypeImpl::GetTensorType()); std::vector> execution_providers; #ifdef USE_CUDA @@ -3021,7 +3020,7 @@ TEST(GradientCheckerTest, PadAndUnflattenGrad) { #endif ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indices_info, shape_info}, - {padded_out_info, out_shape_info}, &max_error, + {padded_out_info}, &max_error, x_datas, {}, true, false, &execution_providers)); EXPECT_IS_TINY(max_error); } diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index c8ec2e52f3078..13024b81f4b3c 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -5786,14 +5786,14 @@ def __init__(self, vocab_size, hidden_size, pad_token_id): # the test_op should be included in padding elimination subgraph and the PadAndUnflatten should be # added to output of test_op. # in case 2, the shapes of inputs of test_op are [batch_size, seqlen, hidden_size] and [batch_size, 1, hidden_size], - # the test_op should be included in padding elimination subgraph and a 'Expand + Reshape + ShrunkenGather' + # the test_op should be included in padding elimination subgraph and a 'Expand + FlattenAndUnpad' # pattern should be insert to the arg of [batch_size, 1, hidden_size]. # in case 3, the shapes of inputs of test_op are [batch_size, seqlen, hidden_size] and [1, hidden_size], - # the test_op should be included in padding elimination subgraph and a 'Expand + Reshape + ShrunkenGather' + # the test_op should be included in padding elimination subgraph and a 'Expand + FlattenAndUnpad' # pattern should be insert to the arg of [batch_size, 1, hidden_size]. # in case 4, the shapes of inputs of test_op are [batch_size, seqlen, hidden_size] and [batch_size, seqlen, hidden_size], # the test_op should be included in padding elimination subgraph and the PadAndUnflatten should be added to - # output of test_op. Besides, the other input of Add should be added 'Reshape + ShrunkenGather' to + # output of test_op. Besides, the other input of Add should be added 'FlattenAndUnpad' to # flatten and elimination padding. 
def test_elementwise(self, input_ids): input_shape = input_ids.size() @@ -5905,9 +5905,9 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): assert len([node.op_type for node in training_model.graph.node if node.op_type == "Squeeze"]) == 1 assert len([node.op_type for node in training_model.graph.node if node.op_type == "PadAndUnflatten"]) == 1 if case >= 2: - assert len([node.op_type for node in training_model.graph.node if node.op_type == "ShrunkenGather"]) == 2 + assert len([node.op_type for node in training_model.graph.node if node.op_type == "FlattenAndUnpad"]) == 3 else: - assert len([node.op_type for node in training_model.graph.node if node.op_type == "ShrunkenGather"]) == 1 + assert len([node.op_type for node in training_model.graph.node if node.op_type == "FlattenAndUnpad"]) == 2 gathergrad_node = next(node for node in training_model.graph.node if node.op_type == "PadAndUnflatten") def find_input_node_type(model, arg): @@ -6071,7 +6071,7 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): _test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-3, rtol=1e-4) training_model = ort_model._torch_module._execution_manager(True)._onnx_models.optimized_model - assert "ShrunkenGather" in [node.op_type for node in training_model.graph.node] + assert "FlattenAndUnpad" in [node.op_type for node in training_model.graph.node] assert "PadAndUnflatten" in [node.op_type for node in training_model.graph.node] del os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] diff --git a/orttraining/orttraining/test/training_ops/cuda/flatten_and_unpad_test.cc b/orttraining/orttraining/test/training_ops/cuda/flatten_and_unpad_test.cc new file mode 100644 index 0000000000000..dd5fa18ab3edd --- /dev/null +++ b/orttraining/orttraining/test/training_ops/cuda/flatten_and_unpad_test.cc @@ -0,0 +1,157 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "test/common/tensor_op_test_utils.h" +#include "test/providers/provider_test_utils.h" + +namespace onnxruntime { +namespace test { + +#if defined(USE_CUDA) || defined(USE_ROCM) + +TEST(FlattenAndUnpadTest, Int32Type2D) { + std::vector input = {1, 1, 3, 2, 0, 3, 0, 4, + 0, 5, 0, 6, 0, 0, 0}; + std::vector indices = {1, 3, 5, 7, 9, 11}; + + std::vector output = {1, 2, 3, 4, 5, 6}; + std::vector unflatten_dims = {5, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {5, 3}, input); + test.AddInput("indices", {6}, indices); + test.AddOutput("output", {6}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, Int32Type3D) { + std::vector input = {0, 0, 0, 1, 2, 3, 0, 0, 0, + 4, 5, 6, 7, 8, 9, 0, 0, 0}; + std::vector indices = {1, 3, 4}; + + std::vector output = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector unflatten_dims = {2, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {2, 3, 3}, input); + test.AddInput("indices", {3}, indices); + test.AddOutput("output", {3, 3}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, Int64Type2D) { + std::vector input = {1, 1, 3, 2, 0, 3, 0, 4, + 0, 5, 0, 6, 0, 0, 0}; + std::vector indices = {1, 3, 5, 7, 9, 11}; + + std::vector output = {1, 2, 3, 4, 5, 6}; + std::vector unflatten_dims = {5, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {5, 3}, input); + test.AddInput("indices", {6}, indices); + test.AddOutput("output", {6}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, Int64Type3D) { + std::vector input = {0, 0, 0, 1, 2, 3, 0, 0, 0, + 4, 5, 6, 7, 8, 9, 0, 0, 0}; + std::vector indices = {1, 3, 4}; + + std::vector output = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector unflatten_dims = {2, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {2, 3, 3}, input); + test.AddInput("indices", {3}, indices); + test.AddOutput("output", {3, 3}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, FloatType2D) { + std::vector input = {1.0f, 1.0f, 3.0f, 2.0f, 0.0f, 3.0f, 0.0f, 4.0f, + 0.0f, 5.0f, 0.0f, 6.0f, 0.0f, 0.0f, 0.0f}; + std::vector indices = {1, 3, 5, 7, 9, 11}; + + std::vector output = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.f}; + std::vector unflatten_dims = {5, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {5, 3}, input); + test.AddInput("indices", {6}, indices); + test.AddOutput("output", {6}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, FloatType3D) { + std::vector input = {0.0f, 0.0f, 0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f, + 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 0.0f, 0.0f}; + std::vector indices = {1, 3, 4}; + + std::vector output = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.f, 7.f, 8.f, 9.f}; + std::vector unflatten_dims = {2, 3}; + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {2, 3, 3}, input); + test.AddInput("indices", {3}, indices); + test.AddOutput("output", {3, 3}, output); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, MLFloat16Type2D) { + std::vector input = {0.0f, 1.0f, 0.0f, 2.0f, 0.0f, 3.0f, 0.0f, 4.0f, + 0.0f, 5.0f, 0.0f, 
6.0f, 0.0f, 0.0f, 0.0f}; + std::vector indices = {1, 3, 5, 7, 9, 11}; + + std::vector output = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.f}; + std::vector unflatten_dims = {5, 3}; + + std::vector input_half; + input_half.resize(input.size()); + ConvertFloatToMLFloat16(input.data(), input_half.data(), static_cast(input.size())); + std::vector output_half; + output_half.resize(output.size()); + ConvertFloatToMLFloat16(output.data(), output_half.data(), static_cast(output.size())); + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {5, 3}, input_half); + test.AddInput("indices", {6}, indices); + test.AddOutput("output", {6}, output_half); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +TEST(FlattenAndUnpadTest, MLFloat16Type3D) { + std::vector input = {0.0f, 0.0f, 0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f, + 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 0.0f, 0.0f}; + std::vector indices = {1, 3, 4}; + + std::vector output = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.f, 7.f, 8.f, 9.f}; + std::vector unflatten_dims = {2, 3}; + + std::vector input_half; + input_half.resize(input.size()); + ConvertFloatToMLFloat16(input.data(), input_half.data(), static_cast(input.size())); + std::vector output_half; + output_half.resize(output.size()); + ConvertFloatToMLFloat16(output.data(), output_half.data(), static_cast(output.size())); + + OpTester test("FlattenAndUnpad", 1, onnxruntime::kMSDomain); + test.AddInput("input", {2, 3, 3}, input_half); + test.AddInput("indices", {3}, indices); + test.AddOutput("output", {3, 3}, output_half); + test.AddOutput("unflatten_dims", {2}, unflatten_dims); + test.Run(); +} + +#endif + +} // namespace test +} // namespace onnxruntime diff --git a/orttraining/orttraining/test/training_ops/cuda/pad_and_unflatten_test.cc b/orttraining/orttraining/test/training_ops/cuda/pad_and_unflatten_test.cc index a800f17e59ae0..9a86955e09379 100644 --- a/orttraining/orttraining/test/training_ops/cuda/pad_and_unflatten_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/pad_and_unflatten_test.cc @@ -17,14 +17,11 @@ TEST(PadAndUnflattenTest, FloatType1D) { std::vector output = {0.0f, 1.0f, 0.0f, 2.0f, 0.0f, 3.0f, 0.0f, 4.0f, 0.0f, 5.0f, 0.0f, 6.0f, 0.0f, 0.0f, 0.0f}; - std::vector full_flatten_dims = {15}; - OpTester test("PadAndUnflatten", 1, onnxruntime::kMSDomain); test.AddInput("input", {6}, input); test.AddInput("indices", {6}, indices); test.AddInput("unflatten_dims", {2}, unflatten_dims); test.AddOutput("output", {5, 3}, output); - test.AddOutput("full_flatten_dims", {1}, full_flatten_dims); test.Run(); } @@ -36,14 +33,11 @@ TEST(PadAndUnflattenTest, FloatType2D) { std::vector output = {0.0f, 0.0f, 0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 0.0f, 0.0f}; - std::vector full_flatten_dims = {6, 3}; - OpTester test("PadAndUnflatten", 1, onnxruntime::kMSDomain); test.AddInput("input", {3, 3}, input); test.AddInput("indices", {3}, indices); test.AddInput("unflatten_dims", {2}, unflatten_dims); test.AddOutput("output", {2, 3, 3}, output); - test.AddOutput("full_flatten_dims", {2}, full_flatten_dims); test.Run(); } @@ -55,8 +49,6 @@ TEST(PadAndUnflattenTest, MLFloat16Type1D) { std::vector output = {0.0f, 1.0f, 0.0f, 2.0f, 0.0f, 3.0f, 0.0f, 4.0f, 0.0f, 5.0f, 0.0f, 6.0f, 0.0f, 0.0f, 0.0f}; - std::vector full_flatten_dims = {15}; - std::vector input_half; input_half.resize(input.size()); ConvertFloatToMLFloat16(input.data(), input_half.data(), int(input.size())); @@ -69,7 +61,6 @@ 
TEST(PadAndUnflattenTest, MLFloat16Type1D) { test.AddInput("indices", {6}, indices); test.AddInput("unflatten_dims", {2}, unflatten_dims); test.AddOutput("output", {5, 3}, output_half); - test.AddOutput("full_flatten_dims", {1}, full_flatten_dims); test.Run(); } @@ -81,8 +72,6 @@ TEST(PadAndUnflattenTest, MLFloat16Type2D) { std::vector output = {0.0f, 0.0f, 0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 0.0f, 0.0f}; - std::vector full_flatten_dims = {6, 3}; - std::vector input_half; input_half.resize(input.size()); ConvertFloatToMLFloat16(input.data(), input_half.data(), int(input.size())); @@ -95,7 +84,6 @@ TEST(PadAndUnflattenTest, MLFloat16Type2D) { test.AddInput("indices", {3}, indices); test.AddInput("unflatten_dims", {2}, unflatten_dims); test.AddOutput("output", {2, 3, 3}, output_half); - test.AddOutput("full_flatten_dims", {2}, full_flatten_dims); test.Run(); } diff --git a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc index eeaa51c4dc1d8..dcf733153bdad 100644 --- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc @@ -207,6 +207,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, FakeQuantGrad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BatchScale); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, PadAndUnflatten); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, FlattenAndUnpad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, ScaledSum); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ResizeGrad); @@ -462,6 +463,7 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.cc b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.cc new file mode 100644 index 0000000000000..8bfb10f686771 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.cc @@ -0,0 +1,91 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "orttraining/training_ops/cuda/tensor/flatten_and_unpad.h" +#include "orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h" +#include "core/providers/cuda/shared_inc/cuda_utils.h" + +namespace onnxruntime { +namespace cuda { + +ONNX_OPERATOR_KERNEL_EX( + FlattenAndUnpad, + kMSDomain, + 1, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", BuildKernelDefConstraints()) + .TypeConstraint("T_INT", DataTypeImpl::GetTensorType()) + .OutputMemoryType(OrtMemTypeCPUOutput, 1), + FlattenAndUnpad); + +// Put implementation in the anonymous namespace to avoid name collision in the global namespace. 
+namespace { + +template +struct FlattenAndUnpadFunctor { + void operator()(cudaStream_t stream, + const int64_t output_element_count, + const fast_divmod output_element_stride_fdm, + const int64_t index_value_upper_bound, + const Tensor& input_tensor, + const Tensor& indices_tensor, + Tensor& output_tensor) const { + typedef typename ToCudaType::MappedType CudaT; + const CudaT* input_data = reinterpret_cast(input_tensor.Data()); + + FlattenAndUnpadImpl(stream, output_element_count, output_element_stride_fdm, index_value_upper_bound, + input_data, indices_tensor.Data(), + reinterpret_cast(output_tensor.MutableData())); + } +}; + +} // namespace + +Status FlattenAndUnpad::ComputeInternal(OpKernelContext* context) const { + const Tensor* input_tensor = context->Input(0); + const Tensor* indices_tensor = context->Input(1); + ORT_ENFORCE(input_tensor->Shape().NumDimensions() >= 2, + "input_tensor tensor must have at least 2 dimensions.", input_tensor->Shape().NumDimensions()); + ORT_ENFORCE(indices_tensor->Shape().NumDimensions() == 1, + "indices_tensor tensor must be 1-D.", indices_tensor->Shape().NumDimensions()); + + const auto& input_shape = input_tensor->Shape(); + std::vector output_shape_vec; + output_shape_vec.reserve(input_shape.NumDimensions() - 1); + output_shape_vec.push_back(indices_tensor->Shape()[0]); + int64_t element_stride = 1; + for (size_t i = 2; i < input_shape.NumDimensions(); ++i) { + output_shape_vec.push_back(input_shape[i]); + element_stride *= input_shape[i]; + } + + fast_divmod output_element_stride_fdm(static_cast(element_stride)); + auto output_shape = TensorShape(output_shape_vec); + Tensor* output_tensor = context->Output(0, output_shape); + + std::vector unflatten_dims_vec; + unflatten_dims_vec.reserve(2); + unflatten_dims_vec.push_back(input_shape[0]); + unflatten_dims_vec.push_back(input_shape[1]); + const int64_t index_value_upper_bound = input_shape[0] * input_shape[1]; + + utils::MLTypeCallDispatcher + t_disp(input_tensor->GetElementType()); + t_disp.Invoke(Stream(context), + output_shape.Size(), + output_element_stride_fdm, + index_value_upper_bound, + *input_tensor, + *indices_tensor, + *output_tensor); + + size_t rank = unflatten_dims_vec.size(); + Tensor* unflatten_dims_tensor = context->Output(1, {static_cast(rank)}); + TensorShape(unflatten_dims_vec).CopyDims(unflatten_dims_tensor->MutableData(), rank); + + return Status::OK(); +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.h b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.h new file mode 100644 index 0000000000000..f9c6819a393b8 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad.h @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/providers/cuda/cuda_kernel.h" +#include "core/providers/common.h" + +namespace onnxruntime { +namespace cuda { + +class FlattenAndUnpad final : public CudaKernel { + public: + FlattenAndUnpad(const OpKernelInfo& info) : CudaKernel(info) { + } + + Status ComputeInternal(OpKernelContext* context) const override; +}; + +} // namespace cuda +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.cu new file mode 100644 index 0000000000000..2091a7082ee79 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.cu @@ -0,0 +1,83 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h" +#include "core/providers/cuda/cu_inc/common.cuh" + +namespace onnxruntime { +namespace cuda { + +constexpr int kBlockSize = 256; +constexpr int kNumUnroll = 4; + +template +__global__ void ExtractIputWithIndexKernel(const CUDA_LONG N, + const fast_divmod output_element_stride_fdm, + const int64_t index_value_upper_bound, + const T* input_data, + const int64_t* indices_data, + T* output_data) { + CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x; + CUDA_LONG id = idx * kNumUnroll; + + T input[kNumUnroll]; + if (id < N) { +#pragma unroll + for (int i = 0; i < kNumUnroll; ++i) { + CUDA_LONG li = id + i; + if (li < N) { + int row_index, col_index; + output_element_stride_fdm.divmod(li, row_index, col_index); + assert(indices_data[row_index] < index_value_upper_bound); + input[i] = input_data[indices_data[row_index] * output_element_stride_fdm.d_ + col_index]; + } + } + } + +#pragma unroll + for (int i = 0; i < kNumUnroll; ++i) { + CUDA_LONG li = id + i; + if (li < N) { + output_data[li] = input[i]; + } + } +} + +template +void FlattenAndUnpadImpl(cudaStream_t stream, + const int64_t total_element_count, + const fast_divmod output_element_stride_fdm, + const int64_t index_value_upper_bound, + const T* input_data, + const int64_t* indices_data, + T* output_data) { + const int blocksPerGrid = static_cast(CeilDiv(total_element_count, kBlockSize * kNumUnroll)); + ExtractIputWithIndexKernel<<>>( + static_cast(total_element_count), + output_element_stride_fdm, + index_value_upper_bound, + input_data, + indices_data, + output_data); +} + +#define FLATTEN_AND_UNPAD_IMPL(T) \ + template void FlattenAndUnpadImpl(cudaStream_t stream, \ + const int64_t total_element_count, \ + const fast_divmod output_element_stride_fdm, \ + const int64_t index_value_upper_bound, \ + const T* input_data, \ + const int64_t* indices_data, \ + T* output_data); + +FLATTEN_AND_UNPAD_IMPL(float) +FLATTEN_AND_UNPAD_IMPL(double) +FLATTEN_AND_UNPAD_IMPL(half) +FLATTEN_AND_UNPAD_IMPL(BFloat16) +FLATTEN_AND_UNPAD_IMPL(int32_t) +FLATTEN_AND_UNPAD_IMPL(int64_t) + +#undef FLATTEN_AND_UNPAD_FROM_MASK_IMPL + +} // namespace cuda +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h new file mode 100644 index 0000000000000..75f8c243d3425 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/flatten_and_unpad_impl.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
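+//
+// Contract for FlattenAndUnpadImpl (see flatten_and_unpad_impl.cu): `total_element_count` is the
+// number of output elements, `output_element_stride_fdm` holds the product of the trailing input
+// dimensions, and every entry of `indices_data` is expected to be < `index_value_upper_bound`
+// (i.e. d0 * d1 of the original input); the kernel asserts this.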
+ +#pragma once + +#ifdef USE_ROCM +#include "core/providers/rocm/shared_inc/rocm_utils.h" +#else +#include "core/providers/cuda/shared_inc/cuda_utils.h" +#endif + +namespace onnxruntime { +namespace cuda { + +template +void FlattenAndUnpadImpl(cudaStream_t stream, + const int64_t total_element_count, + const fast_divmod output_element_stride_fdm, + const int64_t index_value_upper_bound, + const T* input_data, + const int64_t* indices_data, + T* output_data); + +} // namespace cuda +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten.cc b/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten.cc index caf89ef840e0c..7bd759e8976c1 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten.cc @@ -17,8 +17,7 @@ ONNX_OPERATOR_KERNEL_EX( .TypeConstraint("T", BuildKernelDefConstraints()) .TypeConstraint("T_INT", DataTypeImpl::GetTensorType()) .TypeConstraint("T_INDEX", DataTypeImpl::GetTensorType()) - .InputMemoryType(OrtMemTypeCPUInput, 2) - .OutputMemoryType(OrtMemTypeCPUOutput, 1), + .InputMemoryType(OrtMemTypeCPUInput, 2), PadAndUnflatten); // Put implementation in the anonymous namespace to avoid name collision in the global namespace. @@ -63,14 +62,11 @@ Status PadAndUnflatten::ComputeInternal(OpKernelContext* context) const { output_shape_vec.push_back(dims_ptr[0]); output_shape_vec.push_back(dims_ptr[1]); - std::vector full_size_flatten_shape_vec; const int64_t flatten_dim_factor = dims_ptr[0] * dims_ptr[1]; - full_size_flatten_shape_vec.push_back(flatten_dim_factor); int64_t element_stride = 1; for (size_t i = 1; i < input_shape.NumDimensions(); ++i) { output_shape_vec.push_back(input_shape[i]); - full_size_flatten_shape_vec.push_back(input_shape[i]); element_stride *= input_shape[i]; } @@ -87,11 +83,6 @@ Status PadAndUnflatten::ComputeInternal(OpKernelContext* context) const { *indices_tensor, *output_tensor); - // Set input shape output tensor. 
- size_t rank = full_size_flatten_shape_vec.size(); - Tensor* input_shape_tensor = context->Output(1, {static_cast(rank)}); - TensorShape(full_size_flatten_shape_vec).CopyDims(input_shape_tensor->MutableData(), rank); - return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten_impl.cu index 22a4f518dfa47..e96770f974bf0 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/tensor/pad_and_unflatten_impl.cu @@ -61,7 +61,7 @@ void PadAndUnflattenImpl(cudaStream_t stream, output_data); } -#define SPECIALIZED_RESTORE_FROM_MASK_IMPL(T) \ +#define PAD_AND_UNFLATTEN_IMPL(T) \ template void PadAndUnflattenImpl(cudaStream_t stream, \ const int64_t total_element_count, \ const fast_divmod output_element_stride_fdm, \ @@ -70,12 +70,12 @@ void PadAndUnflattenImpl(cudaStream_t stream, const int64_t* indices_data, \ T* output_data); -SPECIALIZED_RESTORE_FROM_MASK_IMPL(float) -SPECIALIZED_RESTORE_FROM_MASK_IMPL(double) -SPECIALIZED_RESTORE_FROM_MASK_IMPL(half) -SPECIALIZED_RESTORE_FROM_MASK_IMPL(BFloat16) +PAD_AND_UNFLATTEN_IMPL(float) +PAD_AND_UNFLATTEN_IMPL(double) +PAD_AND_UNFLATTEN_IMPL(half) +PAD_AND_UNFLATTEN_IMPL(BFloat16) -#undef SPECIALIZED_RESTORE_FROM_MASK_IMPL +#undef PAD_AND_UNFLATTEN_FROM_MASK_IMPL } // namespace cuda } // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc index e0749c2fb4d0d..e107a2542fc0b 100644 --- a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc +++ b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc @@ -187,6 +187,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_BFloat16, ReduceAllL2); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16_BFloat16, ReduceAllL2); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, PadAndUnflatten); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, FlattenAndUnpad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ResizeGrad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, ResizeGrad); @@ -390,6 +391,7 @@ Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, From 7a3da4526f98c9cfc6387a5faa1edeec7d88ef17 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Wed, 8 Nov 2023 18:32:12 -0800 Subject: [PATCH 12/13] add bfloat16 support for CUDA Neg kernel (#18306) ### Description Registers BFloat16 datatype as valid input type for CUDA Neg Kernel. ### Motivation and Context Enabling `meta-llama/Llama-2-70b` to be finetuned with ONNX Runtime training. 
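A minimal way to exercise the new registration is to build a one-node bfloat16 `Neg` model and create a session against the CUDA EP. The sketch below is illustrative only (tensor names and shapes are arbitrary, and it assumes a CUDA-enabled onnxruntime build); feeding bfloat16 inputs from Python additionally requires a bfloat16-capable buffer, which is omitted here.

```python
import onnx
from onnx import TensorProto, helper
import onnxruntime as ort

# Build a one-node Neg model with bfloat16 input/output (opset 13).
x = helper.make_tensor_value_info("X", TensorProto.BFLOAT16, [2, 3])
y = helper.make_tensor_value_info("Y", TensorProto.BFLOAT16, [2, 3])
neg = helper.make_node("Neg", inputs=["X"], outputs=["Y"])
graph = helper.make_graph([neg], "neg_bf16", [x], [y])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
onnx.checker.check_model(model)

# With this change, the Neg node can be assigned to the CUDA EP instead of falling back to CPU.
sess = ort.InferenceSession(model.SerializeToString(), providers=["CUDAExecutionProvider"])
print(sess.get_providers())
```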
--------- Co-authored-by: Prathik Rao --- docs/OperatorKernels.md | 2 +- .../core/providers/cuda/cuda_execution_provider.cc | 2 ++ .../core/providers/cuda/math/unary_elementwise_ops.cc | 9 ++++++++- .../providers/cuda/math/unary_elementwise_ops_impl.cu | 9 +++++---- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 38783ac044c22..8e546b30aa4cb 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -665,7 +665,7 @@ Do not modify directly.* |Mul|*in* A:**T**
*in* B:**T**<br> *out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
 |||13|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
 |||[7, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
-|Neg|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)|
+|Neg|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)|
 |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)|
 |NonZero|*in* X:**T**<br>
*out* Y:**tensor(int64)**|13+|**T** = tensor(bool), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint8)| |||[9, 12]|**T** = tensor(bool), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint8)| diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 2d242d7d6fb12..d8a0792209b0f 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -971,6 +971,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Neg); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, BFloat16, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Floor); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Floor); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Floor); @@ -1855,6 +1856,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc index 9ede1f8d90ecc..655877f425054 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc @@ -99,6 +99,7 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa // F: float // D: double // O: bool +// X: BFloat16 #define UNARY_OP_VERSIONED_HFD(name, startver, endver) \ UNARY_OP_VERSIONED_TYPED(name, startver, endver, MLFloat16) \ @@ -124,12 +125,18 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa UNARY_OP_TYPED(name, ver, float) \ UNARY_OP_TYPED(name, ver, double) +#define UNARY_OP_HFDX(name, ver) \ + UNARY_OP_TYPED(name, ver, MLFloat16) \ + UNARY_OP_TYPED(name, ver, BFloat16) \ + UNARY_OP_TYPED(name, ver, float) \ + UNARY_OP_TYPED(name, ver, double) + #define UNARY_OP_CSILHFD(name, ver) \ UNARY_OP_TYPED(name, ver, int8_t) \ UNARY_OP_TYPED(name, ver, int16_t) \ UNARY_OP_TYPED(name, ver, int32_t) \ UNARY_OP_TYPED(name, ver, int64_t) \ - UNARY_OP_HFD(name, ver) + UNARY_OP_HFDX(name, ver) #define UNARY_OP_BWUZCSILHFD(name, ver) \ UNARY_OP_TYPED(name, ver, uint8_t) \ diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu index 1298d53338337..5c3db4a499972 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu @@ -53,13 +53,14 @@ UNARY_OPS() // F: float // D: double // O: bool +// X: BFloat16 #define SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(name) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, half) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, float) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, double) -#define SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDB(name) \ 
+#define SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDX(name) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(name) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, BFloat16) @@ -68,7 +69,7 @@ UNARY_OPS() SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, int16_t) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, int32_t) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, int64_t) \ - SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(name) + SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDX(name) #define SPECIALIZED_UNARY_ELEMENTWISE_IMPL_BWUZCSILHFD(name) \ SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, uint8_t) \ @@ -83,8 +84,8 @@ SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Floor) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Ceil) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Reciprocal) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Sqrt) -SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDB(Log) -SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDB(Exp) +SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDX(Log) +SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFDX(Exp) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Erf) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Round) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Sin) From 55c19d6ab5d49a95f58e66baccdb5f81092e4eeb Mon Sep 17 00:00:00 2001 From: Hector Li Date: Wed, 8 Nov 2023 20:56:36 -0800 Subject: [PATCH 13/13] [QNN EP] Enable option to set QNN context priority (#18315) Enable option qnn_context_priority to set QNN context priority, options: "low", "normal", "normal_high", "high". ### Description Enable option qnn_context_priority to set QNN context priority, options: "low", "normal", "normal_high", "high". This feature guarantees the model inference with higher priority. Tested with onnxruntime_perf_test tool using same model. 1. Run the model on the NPU with single instance, the latency is 300ms. 2. Run the same model on NPU with 2 instance at same time. Case 1: both with same priority (high ) -- latency is 600ms Case 2: 1 with low priority -- latency is 30,000ms 1 with high priority -- latency is 300ms Case 3: 1 with normal priority -- latency is 15,000ms 1 with high priority -- latency is 300ms --- .../core/session/onnxruntime_c_api.h | 1 + .../qnn/builder/qnn_backend_manager.cc | 42 +++++++++++- .../qnn/builder/qnn_backend_manager.h | 4 +- .../core/providers/qnn/builder/qnn_def.h | 8 +++ .../providers/qnn/qnn_execution_provider.cc | 66 +++++++++++++------ .../providers/qnn/qnn_execution_provider.h | 3 +- onnxruntime/test/onnx/main.cc | 8 ++- .../test/perftest/command_args_parser.cc | 5 +- onnxruntime/test/perftest/ort_test_session.cc | 7 +- .../test/providers/qnn/qnn_basic_test.cc | 16 ++++- 10 files changed, 131 insertions(+), 29 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 1d02b72342722..c7d4a236bcf89 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -3604,6 +3604,7 @@ struct OrtApi { * "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will * dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and * may alter model/EP partitioning. Use only for debugging. + * "qnn_context_priority": QNN context priority, options: "low", "normal", "normal_high", "high". Default to "normal". * "htp_graph_finalization_optimization_mode": Set the optimization mode for graph finalization on the HTP backend. Available options: * - "0": Default. * - "1": Faster preparation time, less optimal graph. 
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index dd56731ac9f7f..03d6b46c528c3 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -380,15 +380,48 @@ Status QnnBackendManager::ReleaseProfilehandle() { return Status::OK(); } +Status SetQnnContextConfig(ContextPriority context_priority, QnnContext_Config_t& qnn_context_config) { + qnn_context_config.option = QNN_CONTEXT_CONFIG_OPTION_PRIORITY; + switch (context_priority) { + case ContextPriority::LOW: { + qnn_context_config.priority = QNN_PRIORITY_LOW; + break; + } + case ContextPriority::NORMAL: { + qnn_context_config.priority = QNN_PRIORITY_NORMAL; + break; + } + case ContextPriority::NORMAL_HIGH: { + qnn_context_config.priority = QNN_PRIORITY_NORMAL_HIGH; + break; + } + case ContextPriority::HIGH: { + qnn_context_config.priority = QNN_PRIORITY_HIGH; + break; + } + case ContextPriority::UNDEFINED: { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid Qnn context priority."); + } + default: + qnn_context_config.priority = QNN_PRIORITY_NORMAL; + } // switch + + return Status::OK(); +} + Status QnnBackendManager::CreateContext() { if (true == context_created_) { LOGS_DEFAULT(INFO) << "Context created already."; return Status::OK(); } + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, qnn_context_config)); + const QnnContext_Config_t* context_configs[] = {&qnn_context_config, nullptr}; + auto result = qnn_interface_.contextCreate(backend_handle_, device_handle_, - (const QnnContext_Config_t**)&context_config_, + context_configs, &context_); ORT_RETURN_IF(QNN_CONTEXT_NO_ERROR != result, "Failed to create context."); @@ -486,9 +519,14 @@ Status QnnBackendManager::LoadCachedQnnContextFromBuffer(char* buffer, uint64_t ORT_RETURN_IF(nullptr == qnn_interface_.contextCreateFromBinary, "Invalid function pointer for contextCreateFromBinary."); + + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, qnn_context_config)); + const QnnContext_Config_t* context_configs[] = {&qnn_context_config, nullptr}; + rt = qnn_interface_.contextCreateFromBinary(backend_handle_, device_handle_, - (const QnnContext_Config_t**)&context_config_, + context_configs, static_cast(buffer), buffer_length, &context_, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index de5ccb5a28389..aac82c89d6f49 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -30,11 +30,13 @@ class QnnBackendManager { ProfilingLevel profiling_level, uint32_t rpc_control_latency, HtpPerformanceMode htp_performance_mode, + ContextPriority context_priority, std::string&& qnn_saver_path) : backend_path_(backend_path), profiling_level_(profiling_level), rpc_control_latency_(rpc_control_latency), htp_performance_mode_(htp_performance_mode), + context_priority_(context_priority), qnn_saver_path_(qnn_saver_path) { } ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QnnBackendManager); @@ -186,7 +188,6 @@ class QnnBackendManager { Qnn_LogHandle_t log_handle_ = nullptr; Qnn_DeviceHandle_t device_handle_ = nullptr; Qnn_ContextHandle_t context_ = nullptr; - QnnContext_Config_t** context_config_ = 
nullptr; ProfilingLevel profiling_level_; bool backend_initialized_ = false; bool device_created_ = false; @@ -198,6 +199,7 @@ class QnnBackendManager { std::vector op_package_paths_; uint32_t rpc_control_latency_ = 0; HtpPerformanceMode htp_performance_mode_; + ContextPriority context_priority_; std::string sdk_build_version_ = ""; #ifdef _WIN32 std::set mod_handles_; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_def.h b/onnxruntime/core/providers/qnn/builder/qnn_def.h index 6080c63b555a8..66154fcf346ee 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_def.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_def.h @@ -48,6 +48,14 @@ enum class HtpPerformanceMode : uint8_t { kHtpBalanced, }; +enum class ContextPriority : uint8_t { + LOW = 0, + NORMAL, + NORMAL_HIGH, + HIGH, + UNDEFINED +}; + // Defines the graph optimization strategy used by the HTP backend. enum class HtpGraphFinalizationOptimizationMode : uint8_t { kDefault = 0, diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 6cb276378a09c..8acd0d68b71d0 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -76,6 +76,26 @@ void QNNExecutionProvider::ParseHtpPerformanceMode(std::string htp_performance_m } } +void QNNExecutionProvider::ParseQnnContextPriority(std::string context_priority_string) { + std::transform(context_priority_string.begin(), + context_priority_string.end(), + context_priority_string.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + LOGS_DEFAULT(VERBOSE) << "QNN context priority: " << context_priority_string; + if (context_priority_string == "low") { + context_priority_ = qnn::ContextPriority::LOW; + } else if (context_priority_string == "normal") { + context_priority_ = qnn::ContextPriority::NORMAL; + } else if (context_priority_string == "normal_high") { + context_priority_ = qnn::ContextPriority::NORMAL_HIGH; + } else if (context_priority_string == "high") { + context_priority_ = qnn::ContextPriority::HIGH; + } else { + context_priority_ = qnn::ContextPriority::UNDEFINED; + LOGS_DEFAULT(WARNING) << "QNN context priority: " << context_priority_string << " not valid, set to undefined."; + } +} + void QNNExecutionProvider::ParseHtpGraphFinalizationOptimizationMode(const std::string& htp_graph_finalization_opt_mode_string) { LOGS_DEFAULT(VERBOSE) << "HTP graph finalization optimization mode: " << htp_graph_finalization_opt_mode_string; @@ -96,16 +116,15 @@ void QNNExecutionProvider::ParseHtpGraphFinalizationOptimizationMode(const std:: QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options) - : IExecutionProvider{onnxruntime::kQnnExecutionProvider, true}, - runtime_options_(provider_options_map) { + : IExecutionProvider{onnxruntime::kQnnExecutionProvider, true} { if (session_options) { disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault( kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; } static const std::string CONTEXT_CACHE_ENABLED = "qnn_context_cache_enable"; - auto context_cache_enabled_pos = runtime_options_.find(CONTEXT_CACHE_ENABLED); - if (context_cache_enabled_pos != runtime_options_.end()) { + auto context_cache_enabled_pos = provider_options_map.find(CONTEXT_CACHE_ENABLED); + if (context_cache_enabled_pos != provider_options_map.end()) { if (context_cache_enabled_pos->second == "1") { 
context_cache_enabled_ = true; LOGS_DEFAULT(VERBOSE) << "Context cache enabled."; @@ -113,25 +132,25 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } static const std::string CONTEXT_CACHE_PATH = "qnn_context_cache_path"; - auto context_cache_path_pos = runtime_options_.find(CONTEXT_CACHE_PATH); - if (context_cache_path_pos != runtime_options_.end()) { + auto context_cache_path_pos = provider_options_map.find(CONTEXT_CACHE_PATH); + if (context_cache_path_pos != provider_options_map.end()) { context_cache_path_ = context_cache_path_pos->second; LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_; } bool qnn_context_embed_mode = true; static const std::string CONTEXT_CACHE_EMBED_MODE = "qnn_context_embed_mode"; - auto context_cache_embed_mode_pos = runtime_options_.find(CONTEXT_CACHE_EMBED_MODE); - if (context_cache_embed_mode_pos != runtime_options_.end()) { + auto context_cache_embed_mode_pos = provider_options_map.find(CONTEXT_CACHE_EMBED_MODE); + if (context_cache_embed_mode_pos != provider_options_map.end()) { qnn_context_embed_mode = context_cache_embed_mode_pos->second == "1"; LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode; } static const std::string BACKEND_PATH = "backend_path"; - auto backend_path_pos = runtime_options_.find(BACKEND_PATH); + auto backend_path_pos = provider_options_map.find(BACKEND_PATH); std::string backend_path; - if (backend_path_pos != runtime_options_.end()) { + if (backend_path_pos != provider_options_map.end()) { backend_path = backend_path_pos->second; LOGS_DEFAULT(VERBOSE) << "Backend path: " << backend_path; } else { @@ -139,46 +158,53 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } static const std::string PROFILING_LEVEL = "profiling_level"; - auto profiling_level_pos = runtime_options_.find(PROFILING_LEVEL); - if (profiling_level_pos != runtime_options_.end()) { + auto profiling_level_pos = provider_options_map.find(PROFILING_LEVEL); + if (profiling_level_pos != provider_options_map.end()) { ParseProfilingLevel(profiling_level_pos->second); } static const std::string RPC_CONTROL_LANTENCY = "rpc_control_latency"; - auto latency_pos = runtime_options_.find(RPC_CONTROL_LANTENCY); - if (latency_pos != runtime_options_.end()) { + auto latency_pos = provider_options_map.find(RPC_CONTROL_LANTENCY); + if (latency_pos != provider_options_map.end()) { rpc_control_latency_ = static_cast(std::stoul(latency_pos->second)); LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency_; } htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; static const std::string HTP_PERFORMANCE_MODE = "htp_performance_mode"; - auto htp_performance_mode_pos = runtime_options_.find(HTP_PERFORMANCE_MODE); - if (htp_performance_mode_pos != runtime_options_.end()) { + auto htp_performance_mode_pos = provider_options_map.find(HTP_PERFORMANCE_MODE); + if (htp_performance_mode_pos != provider_options_map.end()) { ParseHtpPerformanceMode(htp_performance_mode_pos->second); } htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; static const std::string HTP_GRAPH_FINALIZATION_OPT_MODE = "htp_graph_finalization_optimization_mode"; - auto htp_graph_finalization_opt_mode_pos = runtime_options_.find(HTP_GRAPH_FINALIZATION_OPT_MODE); - if (htp_graph_finalization_opt_mode_pos != runtime_options_.end()) { + auto htp_graph_finalization_opt_mode_pos = 
provider_options_map.find(HTP_GRAPH_FINALIZATION_OPT_MODE); + if (htp_graph_finalization_opt_mode_pos != provider_options_map.end()) { ParseHtpGraphFinalizationOptimizationMode(htp_graph_finalization_opt_mode_pos->second); } // Enable use of QNN Saver if the user provides a path the QNN Saver backend library. static const std::string QNN_SAVER_PATH_KEY = "qnn_saver_path"; std::string qnn_saver_path; - auto qnn_saver_path_pos = runtime_options_.find(QNN_SAVER_PATH_KEY); - if (qnn_saver_path_pos != runtime_options_.end()) { + auto qnn_saver_path_pos = provider_options_map.find(QNN_SAVER_PATH_KEY); + if (qnn_saver_path_pos != provider_options_map.end()) { qnn_saver_path = qnn_saver_path_pos->second; LOGS_DEFAULT(VERBOSE) << "User specified QNN Saver path: " << qnn_saver_path; } + static const std::string QNN_CONTEXT_PRIORITY = "qnn_context_priority"; + auto qnn_context_priority_pos = provider_options_map.find(QNN_CONTEXT_PRIORITY); + if (qnn_context_priority_pos != provider_options_map.end()) { + ParseQnnContextPriority(qnn_context_priority_pos->second); + } + qnn_backend_manager_ = std::make_unique( std::move(backend_path), profiling_level_, rpc_control_latency_, htp_performance_mode_, + context_priority_, std::move(qnn_saver_path)); qnn_cache_model_handler_ = std::make_unique(qnn_context_embed_mode); } diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index a01b828531555..cf0bff8890d0c 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -57,13 +57,13 @@ class QNNExecutionProvider : public IExecutionProvider { const logging::Logger& logger); void ParseHtpPerformanceMode(std::string htp_performance_mode_string); + void ParseQnnContextPriority(std::string context_priority_string); void ParseHtpGraphFinalizationOptimizationMode(const std::string& htp_graph_finalization_opt_mode_string); void InitQnnGraphConfigs(qnn::QnnGraphConfigsBuilder& configs_holder) const; private: - ProviderOptions runtime_options_; qnn::ProfilingLevel profiling_level_ = qnn::ProfilingLevel::OFF; qnn::HtpPerformanceMode htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; qnn::HtpGraphFinalizationOptimizationMode htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; @@ -74,6 +74,7 @@ class QNNExecutionProvider : public IExecutionProvider { std::string context_cache_path_ = ""; bool disable_cpu_ep_fallback_ = false; // True if CPU EP fallback has been disabled for this session. std::unique_ptr qnn_cache_model_handler_; + qnn::ContextPriority context_priority_ = qnn::ContextPriority::NORMAL; }; } // namespace onnxruntime diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 98646058eec3d..2c0804397cfe8 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -56,6 +56,7 @@ void usage() { "\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n" "\t [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n" "\t 'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n" + "\t [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. 
\n" "\t [QNN only] [qnn_context_embed_mode]: 1 means dump the QNN context binary into the Onnx skeleton model.\n" "\t 0 means dump the QNN context binary into separate bin file and set the path in the Onnx skeleton model.\n" "\t [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n" @@ -488,6 +489,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) { std::string str = str_stream.str(); ORT_THROW("Wrong value for htp_performance_mode. select from: " + str); } + } else if (key == "qnn_context_priority") { + std::set supported_qnn_context_priority = {"low", "normal", "normal_high", "high"}; + if (supported_qnn_context_priority.find(value) == supported_qnn_context_priority.end()) { + ORT_THROW("Supported qnn_context_priority: low, normal, normal_high, high"); + } } else if (key == "qnn_saver_path") { // no validation } else if (key == "htp_graph_finalization_optimization_mode") { @@ -502,7 +508,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { } else { ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable', 'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode', 'qnn_saver_path', -'htp_graph_finalization_optimization_mode'])"); +'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])"); } qnn_options[key] = value; diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 72472e5798792..a72a0d105eefc 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -34,8 +34,8 @@ namespace perftest { "\t-A: Disable memory arena\n" "\t-I: Generate tensor input binding (Free dimensions are treated as 1.)\n" "\t-c [parallel runs]: Specifies the (max) number of runs to invoke simultaneously. Default:1.\n" - "\t-e [cpu|cuda|dnnl|tensorrt|openvino|dml|acl|nnapi|coreml|snpe|rocm|migraphx|xnnpack|vitisai]: Specifies the provider 'cpu','cuda','dnnl','tensorrt', " - "'openvino', 'dml', 'acl', 'nnapi', 'coreml', 'snpe', 'rocm', 'migraphx', 'xnnpack' or 'vitisai'. " + "\t-e [cpu|cuda|dnnl|tensorrt|openvino|dml|acl|nnapi|coreml|qnn|snpe|rocm|migraphx|xnnpack|vitisai]: Specifies the provider 'cpu','cuda','dnnl','tensorrt', " + "'openvino', 'dml', 'acl', 'nnapi', 'coreml', 'qnn', 'snpe', 'rocm', 'migraphx', 'xnnpack' or 'vitisai'. " "Default:'cpu'.\n" "\t-b [tf|ort]: backend to use. Default:ort\n" "\t-r [repeated_times]: Specifies the repeated times if running in 'times' test mode.Default:1000.\n" @@ -71,6 +71,7 @@ namespace perftest { "\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n" "\t [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n" "\t 'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n" + "\t [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. \n" "\t [QNN only] [qnn_saver_path]: QNN Saver backend path. 
e.g '/folderpath/libQnnSaver.so'.\n" "\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n" "\t '0', '1', '2', '3', default is '0'.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index f3ea188043dbe..c2dd81ec9f359 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -367,10 +367,15 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device std::string str = str_stream.str(); ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str); } + } else if (key == "qnn_context_priority") { + std::set supported_qnn_context_priority = {"low", "normal", "normal_high", "high"}; + if (supported_qnn_context_priority.find(value) == supported_qnn_context_priority.end()) { + ORT_THROW("Supported qnn_context_priority: low, normal, normal_high, high"); + } } else { ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable', 'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'htp_performance_mode', 'qnn_saver_path', -'htp_graph_finalization_optimization_mode'])"); +'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])"); } qnn_options[key] = value; diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 02ff834169b2b..2e2acb36e8071 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -174,7 +174,8 @@ TEST(QnnEP, TestDisableCPUFallback_ConflictingConfig) { // shape inferencing issues on QNN. Thus, the models are expected to have a specific input/output // types and shapes. static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bool enable_qnn_saver = false, - std::string htp_graph_finalization_opt_mode = "") { + std::string htp_graph_finalization_opt_mode = "", + std::string qnn_context_priority = "") { Ort::SessionOptions so; // Ensure all type/shape inference warnings result in errors! @@ -199,6 +200,10 @@ static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bo options["htp_graph_finalization_optimization_mode"] = std::move(htp_graph_finalization_opt_mode); } + if (!qnn_context_priority.empty()) { + options["qnn_context_priority"] = std::move(qnn_context_priority); + } + so.AppendExecutionProvider("QNN", options); Ort::Session session(*ort_env, ort_model_path, so); @@ -322,6 +327,15 @@ TEST_F(QnnHTPBackendTests, HTPGraphFinalizationOptimizationModes) { } } +// Test that models run with high QNN context priority. +TEST_F(QnnHTPBackendTests, QnnContextPriorityHigh) { + RunNHWCResizeModel(ORT_MODEL_FOLDER "nhwc_resize_sizes_opset18.quant.onnx", + true, // use_htp + false, // enable_qnn_saver + "", // htp_graph_finalization_opt_mode + "high"); // qnn_context_priority +} + #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) #endif // !defined(ORT_MINIMAL_BUILD)
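
For completeness, a hedged sketch of how an application could opt into the new context priority from Python; the file paths and model name below are placeholders, and the same key/value pairs can be passed from C++ via `AppendExecutionProvider("QNN", options)` as in the test above.

```python
import onnxruntime as ort

# Placeholder paths; requires an onnxruntime build with the QNN EP.
qnn_options = {
    "backend_path": "QnnHtp.dll",        # QNN backend library (HTP)
    "qnn_context_priority": "high",      # new option: low | normal | normal_high | high
}
sess = ort.InferenceSession("model.onnx", providers=[("QNNExecutionProvider", qnn_options)])
```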