From 09c98433e7e6b352c7b878660267d2635c7bfcf6 Mon Sep 17 00:00:00 2001
From: Tianlei Wu
Date: Thu, 14 Nov 2024 00:09:07 -0800
Subject: [PATCH] [CUDA] stable diffusion benchmark allows IO binding for optimum (#22834)

### Description
Update the stable diffusion benchmark:
(1) Allow I/O binding for Optimum.
(2) Stop using num_images_per_prompt; instead, replicate each prompt batch_size times in every engine so that the comparison across engines is fair.

Example: benchmark Optimum on Stable Diffusion 1.5:
```
git clone https://github.com/tianleiwu/optimum
cd optimum
git checkout tlwu/diffusers-io-binding
pip install -e .
pip install -U onnxruntime-gpu
git clone https://github.com/microsoft/onnxruntime
cd onnxruntime/onnxruntime/python/tools/transformers/models/stable_diffusion
git checkout tlwu/benchmark_sd_optimum_io_binding
pip install -r requirements/cuda12/requirements.txt
optimum-cli export onnx --model runwayml/stable-diffusion-v1-5 --task text-to-image ./sd_onnx_fp32
python optimize_pipeline.py -i ./sd_onnx_fp32 -o ./sd_onnx_fp16 --float16
python benchmark.py -e optimum -r cuda -v 1.5 -p ./sd_onnx_fp16
python benchmark.py -e optimum -r cuda -v 1.5 -p ./sd_onnx_fp16 --use_io_binding
```

Example output on H100_80GB_HBM3: 572 ms with I/O binding, 588 ms without; I/O binding saves 16 ms, or 2.7%.
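For reference, the Optimum code path that benchmark.py exercises reduces to roughly the following sketch. The model directory matches the export step above; the prompt, image size, and step count are illustrative, and `use_io_binding=True` requires the Optimum branch installed above:
```
# Rough sketch of the Optimum path measured by benchmark.py (illustrative values).
from optimum.onnxruntime import ORTStableDiffusionPipeline

pipe = ORTStableDiffusionPipeline.from_pretrained(
    "./sd_onnx_fp16",  # directory produced by optimize_pipeline.py above
    provider="CUDAExecutionProvider",
    use_io_binding=True,  # what the new --use_io_binding flag toggles
)
images = pipe(
    prompt=["a photo of an astronaut riding a horse"] * 2,  # batch_size copies of the prompt
    height=512,
    width=512,
    num_inference_steps=50,
    negative_prompt=["blurry, low quality"] * 2,
).images
images[0].save("example_0.jpg")
```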
### Motivation and Context
Optimum is working on enabling I/O binding: https://github.com/huggingface/optimum/pull/2056. This change makes it possible to measure the impact of I/O binding on stable diffusion performance.
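Under the hood, I/O binding lets ONNX Runtime keep a model's inputs and outputs in pre-bound device buffers, so each inference call avoids host/device tensor copies. A minimal standalone sketch of the underlying onnxruntime API (the model path and the tensor names "x" and "y" are hypothetical):
```
# Minimal onnxruntime I/O binding sketch; model path and tensor names are hypothetical.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])
binding = session.io_binding()

x = np.random.rand(1, 3, 224, 224).astype(np.float32)
x_gpu = ort.OrtValue.ortvalue_from_numpy(x, "cuda", 0)  # copy the input to the GPU once
binding.bind_ortvalue_input("x", x_gpu)
binding.bind_output("y", device_type="cuda")  # let ORT allocate the output on the GPU

session.run_with_iobinding(binding)  # run without per-call host/device copies
y_gpu = binding.get_outputs()[0]  # OrtValue resident on the GPU
print(y_gpu.numpy())  # copy back to host only when actually needed
```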
---
 .../models/stable_diffusion/benchmark.py      | 301 ++++++++++--------
 1 file changed, 169 insertions(+), 132 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
index 3879e25386d53..0708d57f040f8 100755
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
@@ -51,6 +51,10 @@ def example_prompts():
     return prompts, negative_prompt
 
 
+def warmup_prompts():
+    return "warm up", "bad"
+
+
 def measure_gpu_memory(monitor_type, func, start_memory=None):
     return measure_memory(is_gpu=True, func=func, monitor_type=monitor_type, start_memory=start_memory)
 
@@ -136,7 +140,14 @@ def run_ort_pipeline(
     prompts, negative_prompt = example_prompts()
 
     def warmup():
-        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
+        prompt, negative = warmup_prompts()
+        pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative] * batch_size,
+        )
 
     # Run warm up, and measure GPU memory of two runs
     # (The first run has cuDNN/MIOpen algo search so it might need more memory)
@@ -149,22 +160,20 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
-            images = pipe(
-                [prompt] * batch_size,
-                height,
-                width,
-                num_inference_steps=steps,
-                negative_prompt=[negative_prompt] * batch_size,
-                guidance_scale=7.5,
-            ).images
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"Inference took {latency:.3f} seconds")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        inference_start = time.time()
+        images = pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative_prompt] * batch_size,
+        ).images
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"Inference took {latency:.3f} seconds")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
 
     from onnxruntime import __version__ as ort_version
 
@@ -200,7 +209,14 @@ def run_torch_pipeline(
 
     # total 2 runs of warm up, and measure GPU memory for CUDA EP
     def warmup():
-        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
+        prompt, negative = warmup_prompts()
+        pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative] * batch_size,
+        )
 
     # Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
     first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
@@ -215,25 +231,23 @@ def warmup():
         if i >= num_prompts:
             break
         torch.cuda.synchronize()
-        for j in range(batch_count):
-            inference_start = time.time()
-            images = pipe(
-                prompt=[prompt] * batch_size,
-                height=height,
-                width=width,
-                num_inference_steps=steps,
-                guidance_scale=7.5,
-                negative_prompt=[negative_prompt] * batch_size,
-                generator=None,  # torch.Generator
-            ).images
+        inference_start = time.time()
+        images = pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative_prompt] * batch_size,
+            generator=None,  # torch.Generator
+        ).images
 
-            torch.cuda.synchronize()
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"Inference took {latency:.3f} seconds")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        torch.cuda.synchronize()
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"Inference took {latency:.3f} seconds")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
 
     return {
         "engine": "torch",
@@ -306,6 +320,7 @@ def get_optimum_ort_pipeline(
     directory: str,
     provider="CUDAExecutionProvider",
    disable_safety_checker: bool = True,
+    use_io_binding: bool = False,
 ):
     from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
 
@@ -321,7 +336,7 @@ def get_optimum_ort_pipeline(
         pipeline = ORTStableDiffusionPipeline.from_pretrained(
             directory,
             provider=provider,
-            use_io_binding=False,  # Not supported by Optimum version 1.17.1 at the time of verification.
+            use_io_binding=use_io_binding,
         )
     elif "xl" in model_name:
         pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
@@ -337,7 +352,7 @@ def get_optimum_ort_pipeline(
             model_name,
             export=True,
             provider=provider,
-            use_io_binding=False,  # Not supported by Optimum version 1.17.1 at the time of verification.
+            use_io_binding=use_io_binding,
         )
         pipeline.save_pretrained(directory)
 
@@ -359,15 +374,33 @@ def run_optimum_ort_pipeline(
     batch_count,
     start_memory,
     memory_monitor_type,
+    use_num_images_per_prompt=False,
 ):
     from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
 
     assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline))
 
-    prompts = example_prompts()
+    prompts, negative_prompt = example_prompts()
 
     def warmup():
-        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
+        prompt, negative = warmup_prompts()
+        if use_num_images_per_prompt:
+            pipe(
+                prompt=prompt,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                negative_prompt=negative,
+                num_images_per_prompt=batch_size,
+            )
+        else:
+            pipe(
+                prompt=[prompt] * batch_size,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                negative_prompt=[negative] * batch_size,
+            )
 
     # Run warm up, and measure GPU memory of two runs.
     # The first run has algo search for cuDNN/MIOpen, so it might need more memory.
@@ -380,23 +413,30 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
+        inference_start = time.time()
+        if use_num_images_per_prompt:
             images = pipe(
-                prompt,
-                height,
-                width,
+                prompt=prompt,
+                height=height,
+                width=width,
                 num_inference_steps=steps,
-                negative_prompt=None,
-                guidance_scale=0.0,  # 7.5
+                negative_prompt=negative_prompt,
                 num_images_per_prompt=batch_size,
             ).images
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"Inference took {latency:.3f} seconds")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        else:
+            images = pipe(
+                prompt=[prompt] * batch_size,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                negative_prompt=[negative_prompt] * batch_size,
+            ).images
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"Inference took {latency:.3f} seconds")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
 
     from onnxruntime import __version__ as ort_version
 
@@ -429,9 +469,12 @@ def run_optimum_ort(
     batch_count: int,
     start_memory,
     memory_monitor_type,
+    use_io_binding: bool = False,
 ):
     load_start = time.time()
-    pipe = get_optimum_ort_pipeline(model_name, directory, provider, disable_safety_checker)
+    pipe = get_optimum_ort_pipeline(
+        model_name, directory, provider, disable_safety_checker, use_io_binding=use_io_binding
+    )
     load_end = time.time()
     print(f"Model loading took {load_end - load_start} seconds")
 
@@ -530,9 +573,8 @@ def run_ort_trt_static(
     pipeline.load_resources(height, width, batch_size)
 
     def warmup():
-        pipeline.run(
-            ["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
-        )
+        prompt, negative = warmup_prompts()
+        pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)
 
     # Run warm up, and measure GPU memory of two runs
     # The first run has algo search so it might need more memory
@@ -548,24 +590,23 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            images, pipeline_time = pipeline.run(
-                [prompt] * batch_size,
-                [negative_prompt] * batch_size,
-                height,
-                width,
-                denoising_steps=steps,
-                guidance=7.5,
-                seed=123,
-            )
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = pipeline.run(
+            [prompt] * batch_size,
+            [negative_prompt] * batch_size,
+            height,
+            width,
+            denoising_steps=steps,
+            guidance=7.5,
+            seed=123,
+        )
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
 
     pipeline.teardown()
 
@@ -671,9 +712,8 @@ def run_tensorrt_static(
     pipeline.load_resources(height, width, batch_size)
 
     def warmup():
-        pipeline.run(
-            ["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
-        )
+        prompt, negative = warmup_prompts()
+        pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)
 
     # Run warm up, and measure GPU memory of two runs
     # The first run has algo search so it might need more memory
@@ -689,24 +729,22 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            images, pipeline_time = pipeline.run(
-                [prompt] * batch_size,
-                [negative_prompt] * batch_size,
-                height,
-                width,
-                denoising_steps=steps,
-                guidance=7.5,
-                seed=123,
-            )
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = pipeline.run(
+            [prompt] * batch_size,
+            [negative_prompt] * batch_size,
+            height,
+            width,
+            denoising_steps=steps,
+            seed=123,
+        )
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
 
     pipeline.teardown()
 
@@ -828,7 +866,8 @@ def run_sd_xl_inference(prompt, negative_prompt, seed=None):
         )
 
     def warmup():
-        run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
+        prompt, negative = warmup_prompts()
+        run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)
 
     # Run warm up, and measure GPU memory of two runs
     # The first run has algo search so it might need more memory
@@ -845,20 +884,15 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            if nvtx_profile:
-                cudart.cudaProfilerStart()
-            images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
-            if nvtx_profile:
-                cudart.cudaProfilerStop()
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.png")
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.png")
 
     pipeline.teardown()
 
@@ -911,8 +945,6 @@ def run_ort_trt_xl(
         opt_batch_size=batch_size,
     )
 
-    from cuda import cudart
-
     assert batch_size <= max_batch_size
 
     pipeline.load_resources(height, width, batch_size)
@@ -929,7 +961,8 @@ def run_sd_xl_inference(prompt, negative_prompt, seed=None):
         )
 
     def warmup():
-        run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
+        prompt, negative = warmup_prompts()
+        run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)
 
     # Run warm up, and measure GPU memory of two runs
     # The first run has algo search so it might need more memory
@@ -946,22 +979,17 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            if nvtx_profile:
-                cudart.cudaProfilerStart()
-            images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
-            if nvtx_profile:
-                cudart.cudaProfilerStop()
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                filename = f"{image_filename_prefix}_{i}_{j}_{k}.png"
-                image.save(filename)
-                print("Image saved to", filename)
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            filename = f"{image_filename_prefix}_{i}_{k}.png"
+            image.save(filename)
+            print("Image saved to", filename)
 
     pipeline.teardown()
 
@@ -1137,6 +1165,14 @@ def parse_arguments():
     )
     parser.set_defaults(use_xformers=False)
 
+    parser.add_argument(
+        "--use_io_binding",
+        required=False,
+        action="store_true",
+        help="Use I/O Binding for Optimum.",
+    )
+    parser.set_defaults(use_io_binding=False)
+
     parser.add_argument(
         "-b",
         "--batch_size",
@@ -1176,8 +1212,8 @@ def parse_arguments():
         "--num_prompts",
         required=False,
         type=int,
-        default=1,
-        help="Number of prompts. Default is 1.",
+        default=10,
+        help="Number of prompts. Default is 10.",
     )
 
     parser.add_argument(
@@ -1312,6 +1348,7 @@ def main():
             batch_count=args.batch_count,
             start_memory=start_memory,
             memory_monitor_type=memory_monitor_type,
+            use_io_binding=args.use_io_binding,
         )
     elif args.engine == "onnxruntime":
         assert args.pipeline and os.path.isdir(