From 8044e5f60332940217576a067d99b454a83c5457 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 8 Nov 2023 00:42:55 -0800 Subject: [PATCH] SDXL: Update demo with dynamic shape serving with CUDA EP (#18340) Update the SDXL demo with dynamic shape serving with CUDA EP. --- .../stable_diffusion/demo_txt2img_xl.py | 103 ++++++++++++++---- .../models/stable_diffusion/demo_utils.py | 2 +- .../stable_diffusion/diffusion_models.py | 23 +++- .../pipeline_stable_diffusion.py | 9 +- 4 files changed, 107 insertions(+), 30 deletions(-) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py index 16e776a08282c..0b529875a2fe7 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py @@ -29,17 +29,7 @@ from pipeline_txt2img_xl import Txt2ImgXLPipeline -def run_demo(): - """Run Stable Diffusion XL Base + Refiner together (known as ensemble of expert denoisers) to generate an image.""" - - args = parse_arguments(is_xl=True, description="Options for Stable Diffusion XL Demo") - - prompt, negative_prompt = repeat_prompt(args) - - # Recommend image size as one of those used in training (see Appendix I in https://arxiv.org/pdf/2307.01952.pdf). - image_height = args.height - image_width = args.width - +def load_pipelines(args, batch_size): # Register TensorRT plugins engine_type = get_engine_type(args.engine) if engine_type == EngineType.TRT: @@ -49,19 +39,18 @@ def run_demo(): max_batch_size = 16 if (engine_type in [EngineType.ORT_TRT, EngineType.TRT]) and ( - args.build_dynamic_shape or image_height > 512 or image_width > 512 + args.build_dynamic_shape or args.height > 512 or args.width > 512 ): max_batch_size = 4 - batch_size = len(prompt) if batch_size > max_batch_size: raise ValueError(f"Batch size {batch_size} is larger than allowed {max_batch_size}.") # No VAE decoder in base when it outputs latent instead of image. - base_info = PipelineInfo(args.version, use_vae=False) + base_info = PipelineInfo(args.version, use_vae=False, min_image_size=640, max_image_size=1536) base = init_pipeline(Txt2ImgXLPipeline, base_info, engine_type, args, max_batch_size, batch_size) - refiner_info = PipelineInfo(args.version, is_refiner=True) + refiner_info = PipelineInfo(args.version, is_refiner=True, min_image_size=640, max_image_size=1536) refiner = init_pipeline(Img2ImgXLPipeline, refiner_info, engine_type, args, max_batch_size, batch_size) if engine_type == EngineType.TRT: @@ -77,7 +66,13 @@ def run_demo(): enable_vae_slicing = True if enable_vae_slicing: refiner.backend.enable_vae_slicing() + return base, refiner + +def run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False): + image_height = args.height + image_width = args.width + batch_size = len(prompt) base.load_resources(image_height, image_width, batch_size) refiner.load_resources(image_height, image_width, batch_size) @@ -112,10 +107,14 @@ def run_base_and_refiner(warmup=False): # inference once to get cuda graph _, _ = run_base_and_refiner(warmup=True) - print("[I] Warming up ..") + if args.num_warmup_runs > 0: + print("[I] Warming up ..") for _ in range(args.num_warmup_runs): _, _ = run_base_and_refiner(warmup=True) + if is_warm_up: + return + print("[I] Running StableDiffusion XL pipeline") if args.nvtx_profile: cudart.cudaProfilerStart() @@ -123,14 +122,80 @@ def run_base_and_refiner(warmup=False): if args.nvtx_profile: cudart.cudaProfilerStop() - base.teardown() - print("|------------|--------------|") print("| {:^10} | {:>9.2f} ms |".format("e2e", latency)) print("|------------|--------------|") + + +def run_demo(args): + """Run Stable Diffusion XL Base + Refiner together (known as ensemble of expert denoisers) to generate an image.""" + + prompt, negative_prompt = repeat_prompt(args) + batch_size = len(prompt) + base, refiner = load_pipelines(args, batch_size) + run_pipelines(args, base, refiner, prompt, negative_prompt) + base.teardown() + refiner.teardown() + + +def run_dynamic_shape_demo(args): + """Run demo of generating images with different size with list of prompts with ORT CUDA provider.""" + args.engine = "ORT_CUDA" + args.scheduler = "UniPC" + args.denoising_steps = 8 + args.disable_cuda_graph = True + + batch_size = args.repeat_prompt + base, refiner = load_pipelines(args, batch_size) + + image_sizes = [ + (1024, 1024), + (1152, 896), + (896, 1152), + (1216, 832), + (832, 1216), + (1344, 768), + (768, 1344), + (1536, 640), + (640, 1536), + ] + + # Warm up the pipelines. This only need once before serving. + args.prompt = ["warm up"] + args.num_warmup_runs = 3 + prompt, negative_prompt = repeat_prompt(args) + for height, width in image_sizes: + args.height = height + args.width = width + print(f"\nWarm up pipelines for Batch_size={batch_size}, Height={height}, Width={width}") + run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=True) + + # Run pipeline on a list of prompts. + prompts = [ + "starry night over Golden Gate Bridge by van gogh", + "little cute gremlin sitting on a bed, cinematic", + ] + args.num_warmup_runs = 0 + for example_prompt in prompts: + args.prompt = [example_prompt] + prompt, negative_prompt = repeat_prompt(args) + + for height, width in image_sizes: + args.height = height + args.width = width + print(f"\nBatch_size={batch_size}, Height={height}, Width={width}, Prompt={example_prompt}") + run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False) + + base.teardown() refiner.teardown() if __name__ == "__main__": coloredlogs.install(fmt="%(funcName)20s: %(message)s") - run_demo() + + args = parse_arguments(is_xl=True, description="Options for Stable Diffusion XL Demo") + no_prompt = isinstance(args.prompt, list) and len(args.prompt) == 1 and not args.prompt[0] + if no_prompt: + run_dynamic_shape_demo(args) + else: + run_demo(args) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py index e65efd2c53839..d0e4e3adefbc3 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py @@ -78,7 +78,7 @@ def parse_arguments(is_xl: bool, description: str): help="Root Directory to store torch or ONNX models, built engines and output images etc.", ) - parser.add_argument("prompt", nargs="+", help="Text prompt(s) to guide image generation.") + parser.add_argument("prompt", nargs="*", default=[""], help="Text prompt(s) to guide image generation.") parser.add_argument( "--negative-prompt", nargs="*", default=[""], help="Optional negative prompt(s) to guide the image generation." diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py index 4a2e9eb3443da..d93ca8dba7fa0 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py @@ -82,12 +82,21 @@ def infer_shapes(self): class PipelineInfo: - def __init__(self, version: str, is_inpaint: bool = False, is_refiner: bool = False, use_vae=False): + def __init__( + self, + version: str, + is_inpaint: bool = False, + is_refiner: bool = False, + use_vae=False, + min_image_size=256, + max_image_size=1024, + ): self.version = version self._is_inpaint = is_inpaint self._is_refiner = is_refiner self._use_vae = use_vae - + self._min_image_size = min_image_size + self._max_image_size = max_image_size if is_refiner: assert self.is_xl() @@ -187,6 +196,12 @@ def unet_embedding_dim(self): else: raise ValueError(f"Invalid version {self.version}") + def min_image_size(self): + return self._min_image_size + + def max_image_size(self): + return self._max_image_size + class BaseModel: def __init__( @@ -209,8 +224,8 @@ def __init__( self.min_batch = 1 self.max_batch = max_batch_size - self.min_image_shape = 256 # min image resolution: 256x256 - self.max_image_shape = 1024 # max image resolution: 1024x1024 + self.min_image_shape = pipeline_info.min_image_size() + self.max_image_shape = pipeline_info.max_image_size() self.min_latent_shape = self.min_image_shape // 8 self.max_latent_shape = self.max_image_shape // 8 diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py index e28db2b77105a..e34fab1218b21 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py @@ -163,12 +163,9 @@ def is_backend_tensorrt(self): return self.engine_type == EngineType.TRT def set_denoising_steps(self, denoising_steps: int): - if self.denoising_steps != denoising_steps: - assert self.denoising_steps is None # TODO(tianleiwu): support changing steps in different runs - # Pre-compute latent input scales and linear multistep coefficients - self.scheduler.set_timesteps(denoising_steps) - self.scheduler.configure() - self.denoising_steps = denoising_steps + self.scheduler.set_timesteps(denoising_steps) + self.scheduler.configure() + self.denoising_steps = denoising_steps def load_resources(self, image_height, image_width, batch_size): # If engine is built with static input shape, call this only once after engine build.