diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md index 5927a469ca3e4..b10c10c87ee57 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md @@ -21,7 +21,7 @@ These optimizations are firstly carried out on CUDA EP. They may not work on oth | [demo_txt2img.py](./demo_txt2img.py) | Demo of text to image generation using Stable Diffusion models except XL. | | [optimize_pipeline.py](./optimize_pipeline.py) | Optimize Stable Diffusion ONNX models exported from Huggingface diffusers or optimum | | [benchmark.py](./benchmark.py) | Benchmark latency and memory of OnnxRuntime, xFormers or PyTorch 2.0 on stable diffusion. | -| [benchmark_turbo.py](./benchmark_controlnet.py)| Benchmark latency of PyTorch or Stable-Fast with canny control net. | +| [benchmark_controlnet.py](./benchmark_controlnet.py)| Benchmark latency of canny control net. 
| ## Run demo with docker @@ -379,97 +379,6 @@ Common settings for below test results: | ------------------------------ | ---------------------- | ------ | ----- | ----- | ----------- | ----------- | | runwayml/stable-diffusion-v1-5 | TRUE | 512 | 512 | 50 | 5 | 1 | -#### Results of RTX 3060 (Windows 11) - -| engine | version | provider | batch size | average latency | first run memory MB | second run memory MB | -| ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- | -| onnxruntime | 1.14.1 | CUDA | 1 | 4.8 | 4,117 | 4,625 | -| torch | 2.0.0+cu117 | default | 1 | 5.6 | 4,325 | 4,047 | -| torch | 1.13.1+cu117 | xformers | 1 | 6.0 | 9,124 | 9,130 | -| onnxruntime | 1.14.1 | CUDA | 4 | 17.7 | 6,659 | 6,659 | -| torch | 2.0.0+cu117 | default | 4 | 20.1 | 6,421 | 6,907 | -| torch | 1.13.1+cu117 | xformers | 4 | 21.6 | 10,407 | 10,409 | -| onnxruntime | 1.14.1 | CUDA | 8 | 33.5 | 6,663 | 6,663 | -| torch | 2.0.0+cu117 | default | 8 | 39.5 | 10,767 | 10,813 | -| torch | 1.13.1+cu117 | xformers | 8 | 41.1 | 10,825 | 9,255 | - - -#### Results of A100-SXM4-40GB (Ubuntu 20.04) -| engine | version | provider | batch size | average latency | first run memory MB | second run memory MB | -| ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- | -| onnxruntime | 1.14.1 | CUDA | 1 | 1.1 | 6,883 | 7,395 | -| torch | 2.0.0+cu117 | default | 1 | 1.5 | 13,828 | 4,400 | -| torch | 2.0.0+cu117 | compile | 1 | 1.8 | 13,892 | 4,386 | -| onnxruntime | 1.14.1 | CUDA | 4 | 3.7 | 7,381 | 7,381 | -| torch | 2.0.0+cu117 | default | 4 | 3.9 | 31,278 | 6,870 | -| torch | 2.0.0+cu117 | compile | 4 | 3.4 | 31,364 | 6,880 | -| onnxruntime | 1.14.1 | CUDA | 8 | 6.9 | 7,411 | 7,411 | -| torch | 2.0.0+cu117 | default | 8 | 7.6 | 31,660 | 10,122 | -| torch | 2.0.0+cu117 | compile | 8 | 6.5 | 31,800 | 10,308 | -| onnxruntime | 1.14.1 | 
CUDA | 16 | 13.6 | 11,479 | 11,479 | -| torch | 2.0.0+cu117 | default | 16 | 14.8 | 32,306 | 16,520 | -| torch | 2.0.0+cu117 | compile | 16 | 12.6 | 32,636 | 16,898 | - -#### Results of A100-PCIE-80GB (Ubuntu 20.04) -| engine | version | provider | batch size | average latency | first run memory MB | second run memory MB | -| ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- | -| tensorrt | 8.6.1 | default | 1 | 1.00 | 9,056 | 9,056 | -| onnxruntime | 1.16.0 nightly | tensorrt | 1 | 1.09 | 11,250 | 11,250 | -| onnxruntime | 1.16.0 nightly | tensorrt (cuda graph) | 1 | 0.96 | 11,382 | 11,382 | -| onnxruntime | 1.16.0 nightly | cuda | 1 | 1.11 | 4,760 | 5,144 | -| onnxruntime | 1.16.0 nightly | cuda (cuda graph) | 1 | 1.04 | 5,230 | 5,390 | -| tensorrt | 8.6.1 | default | 4 | 3.39 | 9,072 | 9,072 | -| onnxruntime | 1.16.0 nightly | tensorrt | 4 | 3.60 | 11,266 | 11,266 | -| onnxruntime | 1.16.0 nightly | tensorrt (cuda graph) | 4 | 3.43 | 11,428 | 11,428 | - -#### Results of V100-PCIE-16GB (Ubuntu 20.04) - -Results from Standard_NC6s_v3 Azure virtual machine: - -| engine | version | provider | batch size | average latency | first run memory MB | second run memory MB | -| ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- | -| onnxruntime | 1.14.1 | CUDA | 1 | 2.7 | 12,646 | 7,152 | -| torch | 2.0.0+cu117 | compile | 1 | 3.2 | 13,317 | 3,909 | -| torch | 2.0.0+cu117 | default | 1 | 2.7 | 13,343 | 3,921 | -| torch | 1.13.1+cu117 | xformers | 1 | 3.5 | 14,979 | 10,449 | -| onnxruntime | 1.14.1 | CUDA | 4 | 8.4 | 7,114 | 7,114 | -| torch | 2.0.0+cu117 | compile | 4 | 8.0 | 13,897 | 6,821 | -| torch | 2.0.0+cu117 | default | 4 | 8.7 | 13,873 | 6,607 | -| torch | 1.13.1+cu117 | xformers | 4 | 9.1 | 12,969 | 8,421 | -| onnxruntime | 1.14.1 | CUDA | 8 | 15.9 | 7,120 | 7,120 | -| torch | 2.0.0+cu117 | 
compile | 8 | 15.5 | 14,669 | 10,355 | -| torch | 2.0.0+cu117 | default | 8 | 17.0 | 14,469 | 9,657 | -| torch | 1.13.1+cu117 | xformers | 8 | 17.4 | 15,593 | 9,133 | - -#### Results of T4 (Ubuntu 20.04) - -To make the result stable, we lock the frequency of T4 GPU like -`sudo nvidia-smi --lock-gpu-clocks=990` for fair comparison. See [nvidia blog](https://developer.nvidia.com/blog/advanced-api-performance-setstablepowerstate/) for more information. Note that performance might be slightly better without locking frequency. - -Results are from Standard_NC4as_T4_v3 Azure virtual machine: - -| engine | version | provider | batch size | average latency | first run memory MB | second run memory MB | -| ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- | -| onnxruntime | 1.14.1 | CUDA | 1 | 5.6 | 4,925 | 4,925 | -| onnxruntime | 1.15.1 | CUDA | 1 | 5.5 | 3,738 | 4,250 | -| onnxruntime | 1.15.1 (tensorrt 8.6.1) | Tensorrt | 1 | 4.8 | 10,710 | 10,710 | -| onnxruntime | 1.16.0 nightly | Tensorrt (cuda graph) | 1 | 4.7 | 11,746 | 10,746 | -| tensorrt | 8.6.1 | default | 1 | 5.0 | 8,530 | 8,530 | -| torch | 1.13.1+cu117 | xformers | 1 | 6.9 | 14,845 | 10,317 | -| torch | 2.0.0+cu117 | compile | 1 | 6.0 | 12,989 | 3,841 | -| torch | 2.0.0+cu117 | default | 1 | 6.4 | 12,987 | 3,841 | -| onnxruntime | 1.14.1 | CUDA | 4 | 23.0 | 6,977 | 6,977 | -| onnxruntime | 1.15.1 | CUDA | 4 | 22.6 | 6,298 | 6,298 | -| onnxruntime | 1.15.1 (tensorrt 8.6.1) | Tensorrt | 4 | 21.8 | 10,746 | 10,746 | -| tensorrt | 8.6.1 | default | 4 | 22.2 | 8,542 | 8,542 | -| torch | 1.13.1+cu117 | xformers | 4 | 25.8 | 12,819 | 8,269 | -| torch | 2.0.0+cu117 | compile | 4 | 22.2 | 14,637 | 6,583 | -| torch | 2.0.0+cu117 | default | 4 | 25.2 | 14,409 | 6,355 | -| onnxruntime | 1.14.1 | CUDA | 8 | 46.4 | 6,779 | 6,779 | -| torch | 1.13.1+cu117 | xformers | 8 | 51.4 | 14,827 | 9,001 | -| torch | 2.0.0+cu117 | compile | 8 | 
46.5 | 12,595 | 10,171 | -| torch | 2.0.0+cu117 | default | 8 | 50.7 | 11,955 | 9,531 | - #### Results of MI250X, 1 GCD (Ubuntu 20.04) | engine | version | provider | batch size | average latency | first run memory MB | second run memory MB | diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py index 1f1db914e274b..6c337af78e0a9 100755 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py @@ -301,67 +301,95 @@ def run_ort( return result -def export_and_run_ort( - version: str, - provider: str, - batch_size: int, - disable_safety_checker: bool, - height: int, - width: int, - steps: int, - num_prompts: int, - batch_count: int, - start_memory, - memory_monitor_type, - enable_cuda_graph: bool, +def get_optimum_ort_pipeline( + model_name: str, + directory: str, + provider="CUDAExecutionProvider", + disable_safety_checker: bool = True, ): - assert provider == "CUDAExecutionProvider" + from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline + + if directory is not None and os.path.exists(directory): + if "xl" in model_name: + pipeline = ORTStableDiffusionXLPipeline.from_pretrained( + directory, + provider=provider, + session_options=None, + use_io_binding=False, + ) + else: + pipeline = ORTStableDiffusionPipeline.from_pretrained( + directory, + provider=provider, + use_io_binding=False, + ) + elif "xl" in model_name: + pipeline = ORTStableDiffusionXLPipeline.from_pretrained( + model_name, + export=True, + provider=provider, + session_options=None, + use_io_binding=False, + ) + pipeline.save_pretrained(directory) + else: + pipeline = ORTStableDiffusionPipeline.from_pretrained( + model_name, + export=True, + provider=provider, + use_io_binding=False, + ) + pipeline.save_pretrained(directory) - from diffusers import 
DDIMScheduler - from diffusion_models import PipelineInfo - from onnxruntime_cuda_txt2img import OnnxruntimeCudaStableDiffusionPipeline + if disable_safety_checker: + pipeline.safety_checker = None + pipeline.feature_extractor = None - pipeline_info = PipelineInfo(version) - model_name = pipeline_info.name() + return pipeline - scheduler = DDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - pipe = OnnxruntimeCudaStableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - requires_safety_checker=not disable_safety_checker, - enable_cuda_graph=enable_cuda_graph, - pipeline_info=pipeline_info, - ) - # re-use cached folder to save ONNX models - pipe.set_cached_folder(model_name) +def run_optimum_ort_pipeline( + pipe, + batch_size: int, + image_filename_prefix: str, + height, + width, + steps, + num_prompts, + batch_count, + start_memory, + memory_monitor_type, +): + from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline + + assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline)) - pipe = pipe.to("cuda", torch_dtype=torch.float16) + prompts = example_prompts() def warmup(): - pipe(["warm up"] * batch_size, image_height=height, image_width=width, num_inference_steps=steps) + pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size) - # Run warm up, and measure GPU memory of two runs - # The first run has algo search so it might need more memory + # Run warm up, and measure GPU memory of two runs. + # The first run has algo search for cuDNN/MIOpen, so it might need more memory. 
first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory) second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory) - # An extra warm up run is needed for cuda graph warmup() - image_filename_prefix = get_image_filename_prefix("ort_cuda", model_name, batch_size, disable_safety_checker) - latency_list = [] - prompts, negative_prompt = example_prompts() for i, prompt in enumerate(prompts): if i >= num_prompts: break for j in range(batch_count): inference_start = time.time() images = pipe( - [prompt] * batch_size, - negative_prompt=[negative_prompt] * batch_size, + prompt, + height, + width, num_inference_steps=steps, + negative_prompt=None, + guidance_scale=0.0, # 7.5 + num_images_per_prompt=batch_size, ).images inference_end = time.time() latency = inference_end - inference_start @@ -373,11 +401,8 @@ def warmup(): from onnxruntime import __version__ as ort_version return { - "model_name": model_name, - "engine": "onnxruntime", + "engine": "optimum_ort", "version": ort_version, - "provider": provider.replace("ExecutionProvider", ""), - "directory": pipe.engine_dir, "height": height, "width": width, "steps": steps, @@ -388,13 +413,13 @@ def warmup(): "median_latency": statistics.median(latency_list), "first_run_memory_MB": first_run_memory, "second_run_memory_MB": second_run_memory, - "disable_safety_checker": disable_safety_checker, - "enable_cuda_graph": enable_cuda_graph, } -def run_ort_trt( - version: str, +def run_optimum_ort( + model_name: str, + directory: str, + provider: str, batch_size: int, disable_safety_checker: bool, height: int, @@ -404,92 +429,36 @@ def run_ort_trt( batch_count: int, start_memory, memory_monitor_type, - max_batch_size: int, - enable_cuda_graph: bool, ): - from diffusers import DDIMScheduler - from diffusion_models import PipelineInfo - from onnxruntime_tensorrt_txt2img import OnnxruntimeTensorRTStableDiffusionPipeline - - pipeline_info = PipelineInfo(version) - model_name = 
pipeline_info.name() - - assert batch_size <= max_batch_size + load_start = time.time() + pipe = get_optimum_ort_pipeline(model_name, directory, provider, disable_safety_checker) + load_end = time.time() + print(f"Model loading took {load_end - load_start} seconds") - scheduler = DDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - pipe = OnnxruntimeTensorRTStableDiffusionPipeline.from_pretrained( - model_name, - revision="fp16", - torch_dtype=torch.float16, - scheduler=scheduler, - requires_safety_checker=not disable_safety_checker, - image_height=height, - image_width=width, - max_batch_size=max_batch_size, - onnx_opset=17, - enable_cuda_graph=enable_cuda_graph, - pipeline_info=pipeline_info, + image_filename_prefix = get_image_filename_prefix("optimum", model_name, batch_size, disable_safety_checker) + result = run_optimum_ort_pipeline( + pipe, + batch_size, + image_filename_prefix, + height, + width, + steps, + num_prompts, + batch_count, + start_memory, + memory_monitor_type, ) - # re-use cached folder to save ONNX models and TensorRT Engines - pipe.set_cached_folder(model_name, revision="fp16") - - pipe = pipe.to("cuda") - - def warmup(): - pipe(["warm up"] * batch_size, negative_prompt=["negative"] * batch_size, num_inference_steps=steps) - - # Run warm up, and measure GPU memory of two runs - # The first run has algo search so it might need more memory - first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory) - second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory) - - warmup() - - image_filename_prefix = get_image_filename_prefix("ort_trt", model_name, batch_size, disable_safety_checker) - - latency_list = [] - prompts, negative_prompt = example_prompts() - for i, prompt in enumerate(prompts): - if i >= num_prompts: - break - for j in range(batch_count): - inference_start = time.time() - images = pipe( - [prompt] * batch_size, - negative_prompt=[negative_prompt] * batch_size, - 
num_inference_steps=steps, - ).images - inference_end = time.time() - latency = inference_end - inference_start - latency_list.append(latency) - print(f"Inference took {latency:.3f} seconds") - for k, image in enumerate(images): - image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg") - - from tensorrt import __version__ as trt_version - - from onnxruntime import __version__ as ort_version - - return { - "model_name": model_name, - "engine": "onnxruntime", - "version": ort_version, - "provider": f"tensorrt({trt_version})", - "directory": pipe.engine_dir, - "height": height, - "width": width, - "steps": steps, - "batch_size": batch_size, - "batch_count": batch_count, - "num_prompts": num_prompts, - "average_latency": sum(latency_list) / len(latency_list), - "median_latency": statistics.median(latency_list), - "first_run_memory_MB": first_run_memory, - "second_run_memory_MB": second_run_memory, - "disable_safety_checker": disable_safety_checker, - "enable_cuda_graph": enable_cuda_graph, - } + result.update( + { + "model_name": model_name, + "directory": directory, + "provider": provider.replace("ExecutionProvider", ""), + "disable_safety_checker": disable_safety_checker, + "enable_cuda_graph": False, + } + ) + return result def run_ort_trt_static( @@ -523,17 +492,16 @@ def run_ort_trt_static( short_name = pipeline_info.short_name() from engine_builder import EngineType, get_engine_paths - from pipeline_txt2img import Txt2ImgPipeline + from pipeline_stable_diffusion import StableDiffusionPipeline engine_type = EngineType.ORT_TRT onnx_dir, engine_dir, output_dir, framework_model_dir, _ = get_engine_paths(work_dir, pipeline_info, engine_type) # Initialize pipeline - pipeline = Txt2ImgPipeline( + pipeline = StableDiffusionPipeline( pipeline_info, scheduler="DDIM", output_dir=output_dir, - hf_token=None, verbose=False, nvtx_profile=nvtx_profile, max_batch_size=max_batch_size, @@ -551,7 +519,6 @@ def run_ort_trt_static( opt_image_height=height, opt_image_width=width, 
opt_batch_size=batch_size, - force_engine_rebuild=False, static_batch=True, static_image_shape=True, max_workspace_size=0, @@ -592,15 +559,11 @@ def warmup(): denoising_steps=steps, guidance=7.5, seed=123, - warmup=True, ) - images = pipeline.to_pil_image( - images - ) # include image conversion time to pil image for apple-to-apple compare inference_end = time.time() latency = inference_end - inference_start latency_list.append(latency) - print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time:.1f} ms") + print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}") for k, image in enumerate(images): image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg") @@ -664,7 +627,7 @@ def run_tensorrt_static( pipeline_info = PipelineInfo(version) from engine_builder import EngineType, get_engine_paths - from pipeline_txt2img import Txt2ImgPipeline + from pipeline_stable_diffusion import StableDiffusionPipeline engine_type = EngineType.TRT onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths( @@ -672,11 +635,10 @@ def run_tensorrt_static( ) # Initialize pipeline - pipeline = Txt2ImgPipeline( + pipeline = StableDiffusionPipeline( pipeline_info, scheduler="DDIM", output_dir=output_dir, - hf_token=None, verbose=False, nvtx_profile=nvtx_profile, max_batch_size=max_batch_size, @@ -693,16 +655,10 @@ def run_tensorrt_static( opt_batch_size=batch_size, opt_image_height=height, opt_image_width=width, - force_export=False, - force_optimize=False, - force_build=False, static_batch=True, static_shape=True, - enable_refit=False, - enable_preview=False, enable_all_tactics=False, timing_cache=timing_cache, - onnx_refit_dir=None, ) # activate engines @@ -744,15 +700,11 @@ def warmup(): denoising_steps=steps, guidance=7.5, seed=123, - warmup=True, ) - images = pipeline.to_pil_image( - images - ) # include image conversion time to pil image for apple-to-apple compare inference_end = time.time() latency = inference_end - 
inference_start latency_list.append(latency) - print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time:.1f} ms") + print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}") for k, image in enumerate(images): image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg") @@ -828,7 +780,6 @@ def init_pipeline(pipeline_class, pipeline_info): pipeline_info, scheduler="DDIM", output_dir=output_dir, - hf_token=None, verbose=False, nvtx_profile=nvtx_profile, max_batch_size=max_batch_size, @@ -845,66 +796,39 @@ def init_pipeline(pipeline_class, pipeline_info): opt_batch_size=batch_size, opt_image_height=height, opt_image_width=width, - force_export=False, - force_optimize=False, - force_build=False, static_batch=True, static_shape=True, - enable_refit=False, - enable_preview=False, enable_all_tactics=False, timing_cache=timing_cache, - onnx_refit_dir=None, ) return pipeline - from pipeline_img2img_xl import Img2ImgXLPipeline - from pipeline_txt2img_xl import Txt2ImgXLPipeline - - base_pipeline_info = PipelineInfo(version) - demo_base = init_pipeline(Txt2ImgXLPipeline, base_pipeline_info) + from pipeline_stable_diffusion import StableDiffusionPipeline - refiner_pipeline_info = PipelineInfo(version, is_refiner=True) - demo_refiner = init_pipeline(Img2ImgXLPipeline, refiner_pipeline_info) + pipeline_info = PipelineInfo(version) + pipeline = init_pipeline(StableDiffusionPipeline, pipeline_info) - max_device_memory = max(demo_base.backend.max_device_memory(), demo_refiner.backend.max_device_memory()) + max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory()) _, shared_device_memory = cudart.cudaMalloc(max_device_memory) - demo_base.backend.activate_engines(shared_device_memory) - demo_refiner.backend.activate_engines(shared_device_memory) + pipeline.backend.activate_engines(shared_device_memory) # Here we use static batch and image size, so the resource allocation only need done once. 
# For dynamic batch and image size, some cost (like memory allocation) shall be included in latency. - demo_base.load_resources(image_height, image_width, batch_size) - demo_refiner.load_resources(image_height, image_width, batch_size) + pipeline.load_resources(image_height, image_width, batch_size) - def run_sd_xl_inference(prompt, negative_prompt, seed=None, warmup=False): - images, time_base = demo_base.run( + def run_sd_xl_inference(prompt, negative_prompt, seed=None): + return pipeline.run( prompt, negative_prompt, image_height, image_width, denoising_steps=steps, guidance=5.0, - warmup=warmup, seed=seed, - return_type="latent", ) - images, time_refiner = demo_refiner.run( - prompt, - negative_prompt, - images, - image_height, - image_width, - denoising_steps=steps, - guidance=5.0, - warmup=warmup, - seed=seed, - ) - return images, time_base + time_refiner - def warmup(): - run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size, warmup=True) + run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size) # Run warm up, and measure GPU memory of two runs # The first run has algo search so it might need more memory @@ -913,7 +837,7 @@ def warmup(): warmup() - model_name = refiner_pipeline_info.name() + model_name = pipeline_info.name() image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, disable_safety_checker) latency_list = [] @@ -926,23 +850,17 @@ def warmup(): # Use warmup mode here since non-warmup mode will save image to disk. 
if nvtx_profile: cudart.cudaProfilerStart() - images, pipeline_time = run_sd_xl_inference( - [prompt] * batch_size, [negative_prompt] * batch_size, seed=123, warmup=True - ) + images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123) if nvtx_profile: cudart.cudaProfilerStop() - images = demo_refiner.to_pil_image( - images - ) # include image conversion time to pil image for apple-to-apple compare inference_end = time.time() latency = inference_end - inference_start latency_list.append(latency) - print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time:.1f} ms") + print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}") for k, image in enumerate(images): image.save(f"{image_filename_prefix}_{i}_{j}_{k}.png") - demo_base.teardown() - demo_refiner.teardown() + pipeline.teardown() return { "model_name": model_name, @@ -979,97 +897,39 @@ def run_ort_trt_xl( nvtx_profile: bool = False, use_cuda_graph=True, ): - from cuda import cudart + from demo_utils import initialize_pipeline + from engine_builder import EngineType + + pipeline = initialize_pipeline( + version=version, + engine_type=EngineType.ORT_TRT, + work_dir=work_dir, + height=height, + width=width, + use_cuda_graph=use_cuda_graph, + max_batch_size=max_batch_size, + opt_batch_size=batch_size, + ) - # Validate image dimensions - image_height = height - image_width = width - if image_height % 8 != 0 or image_width % 8 != 0: - raise ValueError( - f"Image height and width have to be divisible by 8 but specified as: {image_height} and {image_width}." 
- ) + from cuda import cudart assert batch_size <= max_batch_size - from engine_builder import EngineType, get_engine_paths - - def init_pipeline(pipeline_class, pipeline_info): - engine_type = EngineType.ORT_TRT - - onnx_dir, engine_dir, output_dir, framework_model_dir, _ = get_engine_paths( - work_dir, pipeline_info, engine_type - ) - - # Initialize pipeline - pipeline = pipeline_class( - pipeline_info, - scheduler="DDIM", - output_dir=output_dir, - hf_token=None, - verbose=False, - nvtx_profile=nvtx_profile, - max_batch_size=max_batch_size, - use_cuda_graph=use_cuda_graph, - framework_model_dir=framework_model_dir, - engine_type=engine_type, - ) - - pipeline.backend.build_engines( - engine_dir, - framework_model_dir, - onnx_dir, - 17, - opt_image_height=height, - opt_image_width=width, - opt_batch_size=batch_size, - force_engine_rebuild=False, - static_batch=True, - static_image_shape=True, - max_workspace_size=0, - device_id=torch.cuda.current_device(), # TODO: might not work with CUDA_VISIBLE_DEVICES - ) - return pipeline - - from diffusion_models import PipelineInfo - from pipeline_img2img_xl import Img2ImgXLPipeline - from pipeline_txt2img_xl import Txt2ImgXLPipeline - - base_pipeline_info = PipelineInfo(version) - demo_base = init_pipeline(Txt2ImgXLPipeline, base_pipeline_info) - - refiner_pipeline_info = PipelineInfo(version, is_refiner=True) - demo_refiner = init_pipeline(Img2ImgXLPipeline, refiner_pipeline_info) - - demo_base.load_resources(image_height, image_width, batch_size) - demo_refiner.load_resources(image_height, image_width, batch_size) + pipeline.load_resources(height, width, batch_size) - def run_sd_xl_inference(prompt, negative_prompt, seed=None, warmup=False): - images, time_base = demo_base.run( + def run_sd_xl_inference(prompt, negative_prompt, seed=None): + return pipeline.run( prompt, negative_prompt, - image_height, - image_width, - denoising_steps=steps, - guidance=5.0, - warmup=warmup, - seed=seed, - return_type="latent", - ) - 
images, time_refiner = demo_refiner.run( - prompt, - negative_prompt, - images, - image_height, - image_width, + height, + width, denoising_steps=steps, guidance=5.0, - warmup=warmup, seed=seed, ) - return images, time_base + time_refiner def warmup(): - run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size, warmup=True) + run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size) # Run warm up, and measure GPU memory of two runs # The first run has algo search so it might need more memory @@ -1078,7 +938,7 @@ def warmup(): warmup() - model_name = refiner_pipeline_info.name() + model_name = pipeline.pipeline_info.name() image_filename_prefix = get_image_filename_prefix("ort_trt", model_name, batch_size, disable_safety_checker) latency_list = [] @@ -1091,25 +951,19 @@ def warmup(): # Use warmup mode here since non-warmup mode will save image to disk. if nvtx_profile: cudart.cudaProfilerStart() - images, pipeline_time = run_sd_xl_inference( - [prompt] * batch_size, [negative_prompt] * batch_size, seed=123, warmup=True - ) + images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123) if nvtx_profile: cudart.cudaProfilerStop() - images = demo_refiner.to_pil_image( - images - ) # include image conversion time to pil image for apple-to-apple compare inference_end = time.time() latency = inference_end - inference_start latency_list.append(latency) - print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time:.1f} ms") + print(f"End2End took {latency:.3f} seconds. 
Inference latency: {pipeline_time}") for k, image in enumerate(images): filename = f"{image_filename_prefix}_{i}_{j}_{k}.png" image.save(filename) print("Image saved to", filename) - demo_base.teardown() - demo_refiner.teardown() + pipeline.teardown() from tensorrt import __version__ as trt_version @@ -1209,7 +1063,7 @@ def parse_arguments(): required=False, type=str, default="onnxruntime", - choices=["onnxruntime", "torch", "tensorrt"], + choices=["onnxruntime", "optimum", "torch", "tensorrt"], help="Engines to benchmark. Default is onnxruntime.", ) @@ -1423,26 +1277,6 @@ def main(): nvtx_profile=False, use_cuda_graph=args.enable_cuda_graph, ) - elif args.tuning: - print( - "Testing OnnxruntimeTensorRTStableDiffusionPipeline with {}.".format( - "static input shape" if args.enable_cuda_graph else "dynamic batch size" - ) - ) - result = run_ort_trt( - version=args.version, - batch_size=args.batch_size, - disable_safety_checker=not args.enable_safety_checker, - height=args.height, - width=args.width, - steps=args.steps, - num_prompts=args.num_prompts, - batch_count=args.batch_count, - start_memory=start_memory, - memory_monitor_type=memory_monitor_type, - max_batch_size=args.max_trt_batch_size, - enable_cuda_graph=args.enable_cuda_graph, - ) else: print("Testing Txt2ImgPipeline with static input shape. Backend is ORT TensorRT EP.") result = run_ort_trt_static( @@ -1461,15 +1295,13 @@ def main(): nvtx_profile=False, use_cuda_graph=args.enable_cuda_graph, ) + elif args.engine == "optimum" and provider == "CUDAExecutionProvider": + if "xl" in args.version: + os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1" - elif args.engine == "onnxruntime" and provider == "CUDAExecutionProvider" and args.pipeline is None: - print( - "Testing OnnxruntimeCudaStableDiffusionPipeline with {} input shape. 
Backend is ORT CUDA EP.".format( - "static" if args.enable_cuda_graph else "dynamic" - ) - ) - result = export_and_run_ort( - version=args.version, + result = run_optimum_ort( + model_name=sd_model, + directory=args.pipeline, provider=provider, batch_size=args.batch_size, disable_safety_checker=not args.enable_safety_checker, @@ -1480,7 +1312,6 @@ def main(): batch_count=args.batch_count, start_memory=start_memory, memory_monitor_type=memory_monitor_type, - enable_cuda_graph=args.enable_cuda_graph, ) elif args.engine == "onnxruntime": assert args.pipeline and os.path.isdir( diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark_controlnet.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark_controlnet.py index 39b963313ea64..52c64fb7e8e0b 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark_controlnet.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark_controlnet.py @@ -9,6 +9,7 @@ from statistics import mean import torch +from demo_utils import PipelineInfo from diffusers import ( AutoencoderKL, ControlNetModel, @@ -16,6 +17,8 @@ EulerAncestralDiscreteScheduler, StableDiffusionXLControlNetPipeline, ) +from engine_builder import EngineType, get_engine_paths +from pipeline_stable_diffusion import StableDiffusionPipeline """ Benchmark script for SDXL-Turbo with control net for engines like PyTorch or Stable Fast. 
@@ -120,6 +123,111 @@ def load_pipeline(name, engine, use_control_net=False, use_nhwc=False, enable_cu return pipeline +def get_prompt(): + return "little cute gremlin wearing a jacket, cinematic, vivid colors, intricate masterpiece, golden ratio, highly detailed" + + +def load_ort_cuda_pipeline(name, engine, use_control_net=False, enable_cuda_graph=True, work_dir="."): + version = PipelineInfo.supported_models()[name] + guidance_scale = 0.0 + pipeline_info = PipelineInfo( + version, + use_vae=True, + use_fp16_vae=True, + do_classifier_free_guidance=(guidance_scale > 1.0), + controlnet=["canny"] if use_control_net else [], + ) + + engine_type = EngineType.ORT_CUDA if engine == "ort_cuda" else EngineType.ORT_TRT + onnx_dir, engine_dir, output_dir, framework_model_dir, _ = get_engine_paths( + work_dir=work_dir, pipeline_info=pipeline_info, engine_type=engine_type + ) + + pipeline = StableDiffusionPipeline( + pipeline_info, + scheduler="EulerA", + max_batch_size=32, + use_cuda_graph=enable_cuda_graph, + framework_model_dir=framework_model_dir, + output_dir=output_dir, + engine_type=engine_type, + ) + + pipeline.backend.build_engines( + engine_dir=engine_dir, + framework_model_dir=framework_model_dir, + onnx_dir=onnx_dir, + device_id=torch.cuda.current_device(), + ) + + return pipeline + + +def test_ort_cuda( + pipeline, + batch_size=1, + steps=4, + control_image=None, + warmup_runs=3, + test_runs=10, + seed=123, + verbose=False, + image_height=512, + image_width=512, +): + if batch_size > 4 and pipeline.pipeline_info.version == "xl-1.0": + pipeline.backend.enable_vae_slicing() + + pipeline.load_resources(image_height, image_width, batch_size) + + warmup_prompt = "warm up" + for _ in range(warmup_runs): + images, _ = pipeline.run( + [warmup_prompt] * batch_size, + [""] * batch_size, + image_height=image_height, + image_width=image_width, + denoising_steps=steps, + guidance=0.0, + seed=seed, + controlnet_images=[control_image], + 
controlnet_scales=torch.FloatTensor([0.5]), + output_type="image", + ) + assert len(images) == batch_size + + generator = torch.Generator(device="cuda") + generator.manual_seed(seed) + + prompt = get_prompt() + + latency_list = [] + images = None + for _ in range(test_runs): + torch.cuda.synchronize() + start_time = time.perf_counter() + images, _ = pipeline.run( + [prompt] * batch_size, + [""] * batch_size, + image_height=image_height, + image_width=image_width, + denoising_steps=steps, + guidance=0.0, + seed=seed, + controlnet_images=[control_image], + controlnet_scales=torch.FloatTensor([0.5]), + output_type="pil", + ) + torch.cuda.synchronize() + seconds = time.perf_counter() - start_time + latency_list.append(seconds) + + if verbose: + print(latency_list) + + return images, latency_list + + def test(pipeline, batch_size=1, steps=4, control_image=None, warmup_runs=3, test_runs=10, seed=123, verbose=False): control_net_args = {} if hasattr(pipeline, "controlnet"): @@ -130,33 +238,33 @@ def test(pipeline, batch_size=1, steps=4, control_image=None, warmup_runs=3, tes warmup_prompt = "warm up" for _ in range(warmup_runs): - image = pipeline( + images = pipeline( prompt=warmup_prompt, num_inference_steps=steps, num_images_per_prompt=batch_size, guidance_scale=0.0, **control_net_args, ).images - assert len(image) == batch_size + assert len(images) == batch_size generator = torch.Generator(device="cuda") generator.manual_seed(seed) - prompt = "little cute gremlin wearing a jacket, cinematic, vivid colors, intricate masterpiece, golden ratio, highly detailed" + prompt = get_prompt() latency_list = [] - image = None + images = None for _ in range(test_runs): torch.cuda.synchronize() start_time = time.perf_counter() - image = pipeline( + images = pipeline( prompt=prompt, num_inference_steps=steps, num_images_per_prompt=batch_size, guidance_scale=0.0, generator=generator, **control_net_args, - ).images[0] + ).images torch.cuda.synchronize() seconds = time.perf_counter() - 
start_time latency_list.append(seconds) @@ -164,7 +272,7 @@ def test(pipeline, batch_size=1, steps=4, control_image=None, warmup_runs=3, tes if verbose: print(latency_list) - return image, latency_list + return images, latency_list def arguments(): @@ -175,17 +283,25 @@ def arguments(): "--engine", type=str, default="torch", - choices=["torch", "stable_fast"], - help="Backend engine: torch or stable_fast", + choices=["torch", "stable_fast", "ort_cuda", "ort_trt"], + help="Backend engine: torch, stable_fast or ort_cuda", ) parser.add_argument( "--name", type=str, + choices=list(PipelineInfo.supported_models().keys()), default="stabilityai/sdxl-turbo", help="Stable diffusion model name. Default is stabilityai/sdxl-turbo", ) + parser.add_argument( + "--work-dir", + type=str, + default=".", + help="working directory for ort_cuda or ort_trt", + ) + parser.add_argument( "--use_control_net", action="store_true", @@ -239,21 +355,39 @@ def main(): args = arguments() with torch.no_grad(): - pipeline = load_pipeline( - args.name, - args.engine, - use_control_net=args.use_control_net, - use_nhwc=args.use_nhwc, - enable_cuda_graph=args.enable_cuda_graph, - ) + if args.engine == "ort_cuda": + pipeline = load_ort_cuda_pipeline( + args.name, + args.engine, + use_control_net=args.use_control_net, + enable_cuda_graph=args.enable_cuda_graph, + work_dir=args.work_dir, + ) + else: + pipeline = load_pipeline( + args.name, + args.engine, + use_control_net=args.use_control_net, + use_nhwc=args.use_nhwc, + enable_cuda_graph=args.enable_cuda_graph, + ) canny_image = get_canny_image() - if args.engine == "stable_fast": + if args.engine == "ort_cuda": + images, latency_list = test_ort_cuda( + pipeline, + args.batch_size, + args.steps, + control_image=canny_image, + warmup_runs=args.warmup_runs, + verbose=args.verbose, + ) + elif args.engine == "stable_fast": from sfast.utils.compute_precision import low_compute_precision with low_compute_precision(): - image, latency_list = test( + images, 
latency_list = test( pipeline, args.batch_size, args.steps, @@ -262,7 +396,7 @@ def main(): verbose=args.verbose, ) else: - image, latency_list = test( + images, latency_list = test( pipeline, args.batch_size, args.steps, @@ -272,8 +406,8 @@ def main(): ) # Save the first output image to inspect the result. - if image: - image.save( + if images: + images[0].save( f"{args.engine}_{args.name.replace('/', '_')}_{args.batch_size}_{args.steps}_c{int(args.use_control_net)}.png" ) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py index c18747d5c6518..40692701c28d6 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py @@ -26,15 +26,11 @@ add_controlnet_arguments, arg_parser, get_metadata, - init_pipeline, - max_batch, + load_pipelines, parse_arguments, process_controlnet_arguments, repeat_prompt, ) -from diffusion_models import PipelineInfo -from engine_builder import EngineType, get_engine_type -from pipeline_txt2img import Txt2ImgPipeline if __name__ == "__main__": coloredlogs.install(fmt="%(funcName)20s: %(message)s") @@ -45,83 +41,26 @@ controlnet_images, controlnet_scale = process_controlnet_arguments(args) - prompt, negative_prompt = repeat_prompt(args) - - image_height = args.height - image_width = args.width - - # Register TensorRT plugins - engine_type = get_engine_type(args.engine) - if engine_type == EngineType.TRT: - from trt_utilities import init_trt_plugins - - init_trt_plugins() - - max_batch_size = max_batch(args) + pipeline, refiner = load_pipelines(args) + assert refiner is None + prompt, negative_prompt = repeat_prompt(args) batch_size = len(prompt) - if batch_size > max_batch_size: - raise ValueError( - f"Batch size {len(prompt)} is larger than allowed {max_batch_size}. 
If dynamic shape is used, then maximum batch size is 4" - ) - - # For TensorRT, performance of engine built with dynamic shape is very sensitive to the range of image size. - # Here, we reduce the range of image size for TensorRT to trade-off flexibility and performance. - # This range can cover common used shape of landscape 512x768, portrait 768x512, or square 512x512 and 768x768. - min_image_size = 512 if args.engine != "ORT_CUDA" else 256 - max_image_size = 768 if args.engine != "ORT_CUDA" else 1024 - pipeline_info = PipelineInfo( - args.version, - min_image_size=min_image_size, - max_image_size=max_image_size, - do_classifier_free_guidance=(args.guidance > 1.0), - controlnet=args.controlnet_type, - lora_weights=args.lora_weights, - lora_scale=args.lora_scale, - ) - - # Ideally, the optimized batch size and image size for TRT engine shall align with user's preference. That is to - # optimize the shape used most frequently. We can let user config it when we develop a UI plugin. - # In this demo, we optimize batch size 1 and image size 512x512 (or 768x768 for SD 2.0/2.1) for dynamic engine. - # This is mainly for benchmark purpose to simulate the case that we have no knowledge of user's preference. 
- opt_batch_size = 1 if args.build_dynamic_batch else batch_size - opt_image_height = pipeline_info.default_image_size() if args.build_dynamic_shape else args.height - opt_image_width = pipeline_info.default_image_size() if args.build_dynamic_shape else args.width - - pipeline = init_pipeline( - Txt2ImgPipeline, - pipeline_info, - engine_type, - args, - max_batch_size, - opt_batch_size, - opt_image_height, - opt_image_width, - ) - - if engine_type == EngineType.TRT: - max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory()) - _, shared_device_memory = cudart.cudaMalloc(max_device_memory) - pipeline.backend.activate_engines(shared_device_memory) - - if engine_type == EngineType.ORT_CUDA and args.enable_vae_slicing: - pipeline.backend.enable_vae_slicing() - - pipeline.load_resources(image_height, image_width, batch_size) + pipeline.load_resources(args.height, args.width, batch_size) def run_inference(warmup=False): return pipeline.run( prompt, negative_prompt, - image_height, - image_width, - warmup=warmup, + args.height, + args.width, denoising_steps=args.denoising_steps, guidance=args.guidance, seed=args.seed, controlnet_images=controlnet_images, controlnet_scales=controlnet_scale, - return_type="image", + show_latency=not warmup, + output_type="pil", ) if not args.disable_cuda_graph: diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py index b691f5115e6d3..19bbb45d77c93 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py @@ -26,107 +26,11 @@ add_controlnet_arguments, arg_parser, get_metadata, - init_pipeline, - max_batch, + load_pipelines, parse_arguments, process_controlnet_arguments, repeat_prompt, ) -from diffusion_models import PipelineInfo -from engine_builder import 
EngineType, get_engine_type -from pipeline_img2img_xl import Img2ImgXLPipeline -from pipeline_txt2img_xl import Txt2ImgXLPipeline - - -def load_pipelines(args, batch_size): - # Register TensorRT plugins - engine_type = get_engine_type(args.engine) - if engine_type == EngineType.TRT: - from trt_utilities import init_trt_plugins - - init_trt_plugins() - - max_batch_size = max_batch(args) - - if batch_size > max_batch_size: - raise ValueError(f"Batch size {batch_size} is larger than allowed {max_batch_size}.") - - # For TensorRT, performance of engine built with dynamic shape is very sensitive to the range of image size. - # Here, we reduce the range of image size for TensorRT to trade-off flexibility and performance. - # This range can cover most frequent shape of landscape (832x1216), portrait (1216x832) or square (1024x1024). - if args.version == "xl-turbo": - min_image_size = 512 - max_image_size = 768 if args.engine != "ORT_CUDA" else 1024 - else: - min_image_size = 832 if args.engine != "ORT_CUDA" else 512 - max_image_size = 1216 if args.engine != "ORT_CUDA" else 2048 - - # No VAE decoder in base when it outputs latent instead of image. - base_info = PipelineInfo( - args.version, - use_vae=not args.enable_refiner, - min_image_size=min_image_size, - max_image_size=max_image_size, - use_lcm=args.lcm, - do_classifier_free_guidance=(args.guidance > 1.0), - controlnet=args.controlnet_type, - lora_weights=args.lora_weights, - lora_scale=args.lora_scale, - ) - - # Ideally, the optimized batch size and image size for TRT engine shall align with user's preference. That is to - # optimize the shape used most frequently. We can let user config it when we develop a UI plugin. - # In this demo, we optimize batch size 1 and image size 1024x1024 for SD XL dynamic engine. - # This is mainly for benchmark purpose to simulate the case that we have no knowledge of user's preference. 
- opt_batch_size = 1 if args.build_dynamic_batch else batch_size - opt_image_height = base_info.default_image_size() if args.build_dynamic_shape else args.height - opt_image_width = base_info.default_image_size() if args.build_dynamic_shape else args.width - - base = init_pipeline( - Txt2ImgXLPipeline, - base_info, - engine_type, - args, - max_batch_size, - opt_batch_size, - opt_image_height, - opt_image_width, - ) - - refiner = None - if args.enable_refiner: - refiner_version = "xl-1.0" # Allow SDXL Turbo to use refiner. - refiner_info = PipelineInfo( - refiner_version, is_refiner=True, min_image_size=min_image_size, max_image_size=max_image_size - ) - refiner = init_pipeline( - Img2ImgXLPipeline, - refiner_info, - engine_type, - args, - max_batch_size, - opt_batch_size, - opt_image_height, - opt_image_width, - ) - - if engine_type == EngineType.TRT: - max_device_memory = max(base.backend.max_device_memory(), (refiner or base).backend.max_device_memory()) - _, shared_device_memory = cudart.cudaMalloc(max_device_memory) - base.backend.activate_engines(shared_device_memory) - if refiner: - refiner.backend.activate_engines(shared_device_memory) - - if engine_type == EngineType.ORT_CUDA: - enable_vae_slicing = args.enable_vae_slicing - if batch_size > 4 and not enable_vae_slicing and (args.height >= 1024 and args.width >= 1024): - print( - "Updating enable_vae_slicing to be True to avoid cuDNN error for batch size > 4 and resolution >= 1024." 
- ) - enable_vae_slicing = True - if enable_vae_slicing: - (refiner or base).backend.enable_vae_slicing() - return base, refiner def run_pipelines( @@ -145,13 +49,13 @@ def run_base_and_refiner(warmup=False): negative_prompt, image_height, image_width, - warmup=warmup, denoising_steps=args.denoising_steps, guidance=args.guidance, seed=args.seed, controlnet_images=controlnet_image, controlnet_scales=controlnet_scale, - return_type="latent" if refiner else "image", + show_latency=not warmup, + output_type="latent" if refiner else "pil", ) if refiner is None: return images, base_perf @@ -162,14 +66,14 @@ def run_base_and_refiner(warmup=False): images, refiner_perf = refiner.run( prompt, negative_prompt, - images, image_height, image_width, - warmup=warmup, denoising_steps=args.refiner_denoising_steps, + image=images, strength=args.strength, guidance=args.refiner_guidance, seed=seed, + show_latency=not warmup, ) perf_data = None @@ -309,6 +213,32 @@ def run_dynamic_shape_demo(args): refiner.teardown() +def run_turbo_demo(args): + """Run demo of generating images with test prompts with ORT CUDA provider.""" + args.engine = "ORT_CUDA" + args.disable_cuda_graph = True + base, refiner = load_pipelines(args, 1) + + from datasets import load_dataset + + dataset = load_dataset("Gustavosta/Stable-Diffusion-Prompts") + num_rows = dataset["test"].num_rows + batch_size = args.batch_size + num_batch = int(num_rows / batch_size) + args.batch_size = 1 + for i in range(num_batch): + args.prompt = [dataset["test"][i]["Prompt"] for i in range(i * batch_size, (i + 1) * batch_size)] + base.set_scheduler(args.scheduler) + if refiner: + refiner.set_scheduler(args.refiner_scheduler) + prompt, negative_prompt = repeat_prompt(args) + run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False) + + base.teardown() + if refiner: + refiner.teardown() + + if __name__ == "__main__": coloredlogs.install(fmt="%(funcName)20s: %(message)s") @@ -318,6 +248,9 @@ def 
run_dynamic_shape_demo(args): no_prompt = isinstance(args.prompt, list) and len(args.prompt) == 1 and not args.prompt[0] if no_prompt: - run_dynamic_shape_demo(args) + if args.version == "xl-turbo": + run_turbo_demo(args) + else: + run_dynamic_shape_demo(args) else: run_demo(args) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py index c0395b5e4642f..965a2598a2488 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py @@ -29,9 +29,11 @@ import cv2 import numpy as np import torch +from cuda import cudart from diffusion_models import PipelineInfo -from engine_builder import EngineType, get_engine_paths +from engine_builder import EngineType, get_engine_paths, get_engine_type from PIL import Image +from pipeline_stable_diffusion import StableDiffusionPipeline class RawTextArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter): @@ -40,7 +42,8 @@ class RawTextArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatte def arg_parser(description: str): return argparse.ArgumentParser( - description=description, formatter_class=RawTextArgumentDefaultsHelpFormatter, add_help=False + description=description, + formatter_class=RawTextArgumentDefaultsHelpFormatter, ) @@ -65,8 +68,7 @@ def set_default_arguments(args): def parse_arguments(is_xl: bool, parser): - engines = ["ORT_CUDA", "ORT_TRT", "TRT"] - parser.add_argument("--help", action="store_true", help="show this help message and exit") + engines = ["ORT_CUDA", "ORT_TRT", "TRT", "TORCH"] parser.add_argument( "-e", @@ -89,14 +91,14 @@ def parse_arguments(is_xl: bool, parser): ) parser.add_argument( - "-h", + "-y", "--height", type=int, default=None, help="Height of image to generate (must be multiple of 8).", ) parser.add_argument( - "-w", 
"--width", type=int, default=None, help="Height of image to generate (must be multiple of 8)." + "-x", "--width", type=int, default=None, help="Height of image to generate (must be multiple of 8)." ) parser.add_argument( @@ -115,6 +117,13 @@ def parse_arguments(is_xl: bool, parser): help="Root Directory to store torch or ONNX models, built engines and output images etc.", ) + parser.add_argument( + "-i", + "--engine-dir", + default=None, + help="Root Directory to store built engines or optimized ONNX models etc.", + ) + parser.add_argument("prompt", nargs="*", default=[""], help="Text prompt(s) to guide image generation.") parser.add_argument( @@ -208,23 +217,8 @@ def parse_arguments(is_xl: bool, parser): choices=range(14, 18), help="Select ONNX opset version to target for exported models.", ) - parser.add_argument( - "--force-onnx-export", action="store_true", help="Force ONNX export of CLIP, UNET, and VAE models." - ) - parser.add_argument( - "--force-onnx-optimize", action="store_true", help="Force ONNX optimizations for CLIP, UNET, and VAE models." - ) - - # Framework model ckpt - parser.add_argument( - "--framework-model-dir", - default="pytorch_model", - help="Directory for HF saved models. Default is pytorch_model.", - ) - parser.add_argument("--hf-token", type=str, help="HuggingFace API access token for downloading model checkpoints.") # Engine build options. - parser.add_argument("--force-engine-build", action="store_true", help="Force rebuilding the TensorRT engine.") parser.add_argument( "-db", "--build-dynamic-batch", @@ -252,34 +246,14 @@ def parse_arguments(is_xl: bool, parser): # TensorRT only options group = parser.add_argument_group("Options for TensorRT (--engine=TRT) only") - group.add_argument("--onnx-refit-dir", help="ONNX models to load the weights from.") - group.add_argument( - "--build-enable-refit", action="store_true", help="Enable Refit option in TensorRT engines during build." 
- ) - group.add_argument( - "--build-preview-features", action="store_true", help="Build TensorRT engines with preview features." - ) group.add_argument( "--build-all-tactics", action="store_true", help="Build TensorRT engines using all tactic sources." ) args = parser.parse_args() - if args.help: - parser.print_help() - sys.exit() set_default_arguments(args) - if ( - args.engine in ["ORT_CUDA", "ORT_TRT"] - and (args.force_onnx_export or args.force_onnx_optimize) - and not args.force_engine_build - ): - raise ValueError( - "For ORT_CUDA or ORT_TRT, --force_onnx_export and --force_onnx_optimize are not supported. " - "Please use --force_engine_build instead." - ) - # Validate image dimensions if args.height % 64 != 0 or args.width % 64 != 0: raise ValueError( @@ -404,79 +378,224 @@ def repeat_prompt(args): return prompt, negative_prompt -def init_pipeline( - pipeline_class, pipeline_info, engine_type, args, max_batch_size, opt_batch_size, opt_image_height, opt_image_width +def initialize_pipeline( + version="xl-turbo", + is_refiner: bool = False, + is_inpaint: bool = False, + engine_type=EngineType.ORT_CUDA, + work_dir: str = ".", + engine_dir=None, + onnx_opset: int = 17, + scheduler="EulerA", + height=512, + width=512, + nvtx_profile=False, + use_cuda_graph=True, + build_dynamic_batch=False, + build_dynamic_shape=False, + min_image_size: int = 512, + max_image_size: int = 1024, + max_batch_size: int = 16, + opt_batch_size: int = 1, + build_all_tactics=False, + do_classifier_free_guidance=False, + lcm=False, + controlnet=None, + lora_weights=None, + lora_scale=1.0, + use_fp16_vae=True, + use_vae=True, ): + pipeline_info = PipelineInfo( + version, + is_refiner=is_refiner, + is_inpaint=is_inpaint, + use_vae=use_vae, + min_image_size=min_image_size, + max_image_size=max_image_size, + use_fp16_vae=use_fp16_vae, + use_lcm=lcm, + do_classifier_free_guidance=do_classifier_free_guidance, + controlnet=controlnet, + lora_weights=lora_weights, + lora_scale=lora_scale, + ) + 
+ input_engine_dir = engine_dir + onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths( - work_dir=args.work_dir, pipeline_info=pipeline_info, engine_type=engine_type + work_dir=work_dir, pipeline_info=pipeline_info, engine_type=engine_type ) - # Initialize demo - pipeline = pipeline_class( + pipeline = StableDiffusionPipeline( pipeline_info, - scheduler=args.refiner_scheduler if pipeline_info.is_xl_refiner() else args.scheduler, + scheduler=scheduler, output_dir=output_dir, - hf_token=args.hf_token, verbose=False, - nvtx_profile=args.nvtx_profile, + nvtx_profile=nvtx_profile, max_batch_size=max_batch_size, - use_cuda_graph=not args.disable_cuda_graph, + use_cuda_graph=use_cuda_graph, framework_model_dir=framework_model_dir, engine_type=engine_type, ) + import_engine_dir = None + if input_engine_dir: + if not os.path.exists(input_engine_dir): + raise RuntimeError(f"--engine_dir directory does not exist: {input_engine_dir}") + + # Support importing from optimized diffusers onnx pipeline + if engine_type == EngineType.ORT_CUDA and os.path.exists(os.path.join(input_engine_dir, "model_index.json")): + import_engine_dir = input_engine_dir + else: + engine_dir = input_engine_dir + + opt_image_height = pipeline_info.default_image_size() if build_dynamic_shape else height + opt_image_width = pipeline_info.default_image_size() if build_dynamic_shape else width + if engine_type == EngineType.ORT_CUDA: - # Build CUDA EP engines and load pytorch modules pipeline.backend.build_engines( engine_dir=engine_dir, framework_model_dir=framework_model_dir, onnx_dir=onnx_dir, - tmp_dir=os.path.join(args.work_dir or ".", engine_type.name, pipeline_info.short_name(), "tmp"), - force_engine_rebuild=args.force_engine_build, + tmp_dir=os.path.join(work_dir or ".", engine_type.name, pipeline_info.short_name(), "tmp"), device_id=torch.cuda.current_device(), + import_engine_dir=import_engine_dir, ) elif engine_type == EngineType.ORT_TRT: - # Build TensorRT EP 
engines and load pytorch modules pipeline.backend.build_engines( engine_dir, framework_model_dir, onnx_dir, - args.onnx_opset, + onnx_opset, opt_image_height=opt_image_height, opt_image_width=opt_image_width, opt_batch_size=opt_batch_size, - force_engine_rebuild=args.force_engine_build, - static_batch=not args.build_dynamic_batch, - static_image_shape=not args.build_dynamic_shape, + static_batch=not build_dynamic_batch, + static_image_shape=not build_dynamic_shape, max_workspace_size=0, device_id=torch.cuda.current_device(), timing_cache=timing_cache, ) elif engine_type == EngineType.TRT: - # Load TensorRT engines and pytorch modules pipeline.backend.load_engines( engine_dir, framework_model_dir, onnx_dir, - args.onnx_opset, + onnx_opset, opt_batch_size=opt_batch_size, opt_image_height=opt_image_height, opt_image_width=opt_image_width, - force_export=args.force_onnx_export, - force_optimize=args.force_onnx_optimize, - force_build=args.force_engine_build, - static_batch=not args.build_dynamic_batch, - static_shape=not args.build_dynamic_shape, - enable_refit=args.build_enable_refit, - enable_preview=args.build_preview_features, - enable_all_tactics=args.build_all_tactics, + static_batch=not build_dynamic_batch, + static_shape=not build_dynamic_shape, + enable_all_tactics=build_all_tactics, timing_cache=timing_cache, - onnx_refit_dir=args.onnx_refit_dir, ) + elif engine_type == EngineType.TORCH: + pipeline.backend.build_engines(framework_model_dir) + else: + raise RuntimeError("invalid engine type") return pipeline +def load_pipelines(args, batch_size=None): + engine_type = get_engine_type(args.engine) + + # Register TensorRT plugins + if engine_type == EngineType.TRT: + from trt_utilities import init_trt_plugins + + init_trt_plugins() + + max_batch_size = max_batch(args) + + if batch_size is None: + assert isinstance(args.prompt, list) + batch_size = len(args.prompt) * args.batch_size + + if batch_size > max_batch_size: + raise ValueError(f"Batch size {batch_size} 
is larger than allowed {max_batch_size}.") + + # For TensorRT, performance of engine built with dynamic shape is very sensitive to the range of image size. + # Here, we reduce the range of image size for TensorRT to trade-off flexibility and performance. + # This range can cover most frequent shape of landscape (832x1216), portrait (1216x832) or square (1024x1024). + if args.version == "xl-turbo": + min_image_size = 512 + max_image_size = 768 if args.engine != "ORT_CUDA" else 1024 + elif args.version == "xl-1.0": + min_image_size = 832 if args.engine != "ORT_CUDA" else 512 + max_image_size = 1216 if args.engine != "ORT_CUDA" else 2048 + else: + # This range can cover common used shape of landscape 512x768, portrait 768x512, or square 512x512 and 768x768. + min_image_size = 512 if args.engine != "ORT_CUDA" else 256 + max_image_size = 768 if args.engine != "ORT_CUDA" else 1024 + + params = { + "version": args.version, + "is_refiner": False, + "is_inpaint": False, + "engine_type": engine_type, + "work_dir": args.work_dir, + "engine_dir": args.engine_dir, + "onnx_opset": args.onnx_opset, + "scheduler": args.scheduler, + "height": args.height, + "width": args.width, + "nvtx_profile": args.nvtx_profile, + "use_cuda_graph": not args.disable_cuda_graph, + "build_dynamic_batch": args.build_dynamic_batch, + "build_dynamic_shape": args.build_dynamic_shape, + "min_image_size": min_image_size, + "max_image_size": max_image_size, + "max_batch_size": max_batch_size, + "opt_batch_size": 1 if args.build_dynamic_batch else batch_size, + "build_all_tactics": args.build_all_tactics, + "do_classifier_free_guidance": args.guidance > 1.0, + "controlnet": args.controlnet_type, + "lora_weights": args.lora_weights, + "lora_scale": args.lora_scale, + "use_fp16_vae": "xl" in args.version, + "use_vae": True, + } + + if "xl" in args.version: + params["lcm"] = args.lcm + params["use_vae"] = not args.enable_refiner + base = initialize_pipeline(**params) + + refiner = None + if "xl" in 
args.version and args.enable_refiner: + params["version"] = "xl-1.0" # Allow SDXL Turbo to use refiner. + params["is_refiner"] = True + params["scheduler"] = args.refiner_scheduler + params["do_classifier_free_guidance"] = args.refiner_guidance > 1.0 + params["lcm"] = False + params["controlnet"] = None + params["lora_weights"] = None + params["use_vae"] = True + params["use_fp16_vae"] = True + refiner = initialize_pipeline(**params) + + if engine_type == EngineType.TRT: + max_device_memory = max(base.backend.max_device_memory(), (refiner or base).backend.max_device_memory()) + _, shared_device_memory = cudart.cudaMalloc(max_device_memory) + base.backend.activate_engines(shared_device_memory) + if refiner: + refiner.backend.activate_engines(shared_device_memory) + + if engine_type == EngineType.ORT_CUDA: + enable_vae_slicing = args.enable_vae_slicing + if batch_size > 4 and not enable_vae_slicing and (args.height >= 1024 and args.width >= 1024): + print( + "Updating enable_vae_slicing to be True to avoid cuDNN error for batch size > 4 and resolution >= 1024." + ) + enable_vae_slicing = True + if enable_vae_slicing: + (refiner or base).backend.enable_vae_slicing() + return base, refiner + + def get_depth_image(image): """ Create depth map for SDXL depth control net. @@ -542,7 +661,7 @@ def add_controlnet_arguments(parser, is_xl: bool = False): """ Add control net related arguments. 
""" - group = parser.add_argument_group("Options for ControlNet (only supports SD 1.5 or XL).") + group = parser.add_argument_group("Options for ControlNet (supports 1.5, sd-turbo, xl-turbo, xl-1.0).") group.add_argument( "-ci", @@ -622,7 +741,7 @@ def process_controlnet_arguments(args): if len(args.controlnet_type) == 0: return None, None - if args.version not in ["1.5", "xl-1.0", "xl-turbo"]: + if args.version not in ["1.5", "xl-1.0", "xl-turbo", "sd-turbo"]: raise ValueError("This demo only supports ControlNet in Stable Diffusion 1.5, XL or Turbo.") is_xl = "xl" in args.version diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py index 9f3c5a8c938c6..10af22e44d3a5 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py @@ -87,7 +87,7 @@ def __init__( version: str, is_inpaint: bool = False, is_refiner: bool = False, - use_vae=False, + use_vae=True, # TODO: this has couple with output type of pipeline min_image_size=256, max_image_size=1024, use_fp16_vae=True, @@ -161,6 +161,23 @@ def custom_unet(self) -> Optional[str]: def supported_versions(is_xl: bool): return ["xl-1.0", "xl-turbo"] if is_xl else ["1.4", "1.5", "2.0-base", "2.0", "2.1", "2.1-base", "sd-turbo"] + @staticmethod + def supported_models(): + return { + "CompVis/stable-diffusion-v1-4": "1.4", + "runwayml/stable-diffusion-v1-5": "1.5", + "stabilityai/stable-diffusion-2-base": "2.0-base", + "stabilityai/stable-diffusion-2": "2.0", + "stabilityai/stable-diffusion-2-1": "2.1", + "stabilityai/stable-diffusion-2-1-base": "2.1", + "stabilityai/stable-diffusion-xl-base-1.0": "xl-1.0", + "stabilityai/stable-diffusion-xl-refiner-1.0": "xl-1.0", + "stabilityai/sdxl-turbo": "xl-turbo", + "stabilityai/sd-turbo": "sd-turbo", + # 
"runwayml/stable-diffusion-inpainting": "1.5", + # "stabilityai/stable-diffusion-2-inpainting": "2.0", + } + def name(self) -> str: if self.version == "1.4": if self.is_inpaint(): @@ -329,7 +346,7 @@ def get_ort_optimizer(self): def get_model(self): return self.model - def from_pretrained(self, model_class, framework_model_dir, hf_token, subfolder=None, model_name=None, **kwargs): + def from_pretrained(self, model_class, framework_model_dir, subfolder=None, model_name=None, **kwargs): if model_name is None: model_name = self.pipeline_info.name() @@ -343,7 +360,6 @@ def from_pretrained(self, model_class, framework_model_dir, hf_token, subfolder= model_name, subfolder=subfolder, use_safetensors=self.pipeline_info.use_safetensors(), - use_auth_token=hf_token, **kwargs, ).to(self.device) model.save_pretrained(model_dir) @@ -353,7 +369,7 @@ def from_pretrained(self, model_class, framework_model_dir, hf_token, subfolder= model = model_class.from_pretrained(model_dir).to(self.device) return model - def load_model(self, framework_model_dir: str, hf_token: str, subfolder: str): + def load_model(self, framework_model_dir: str, subfolder: str): pass def get_input_names(self) -> List[str]: @@ -405,8 +421,7 @@ def get_shape_dict(self, batch_size, image_height, image_width): def fp32_input_output_names(self) -> List[str]: """For CUDA EP, we export ONNX model with FP32 first, then convert it to mixed precision model. - This is a list of input or output names that are kept as float32 during converting. - For the first version, we will use same data type as TensorRT. + This is a list of input or output names that are kept as float32 in optimized model. 
""" return [] @@ -519,7 +534,7 @@ def get_output_names(self): return ["text_embeddings"] def get_dynamic_axes(self): - return {"input_ids": {0: "B"}, "text_embeddings": {0: "B"}} + return {"input_ids": {0: "B", 1: "S"}, "text_embeddings": {0: "B", 1: "S"}} def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_image_shape): self.check_dims(batch_size, image_height, image_width) @@ -581,7 +596,7 @@ def add_hidden_states_graph_output(self, model: ModelProto, optimized_onnx_path, onnx.helper.make_tensor_value_info( graph_output_name, graph.output[0].type.tensor_type.elem_type, - ["B", self.text_maxlen, self.embedding_dim], + ["B", "S", self.embedding_dim], ) ) @@ -660,8 +675,8 @@ def optimize_trt(self, input_onnx_path, optimized_onnx_path): else: onnx.save(onnx_opt_graph, optimized_onnx_path) - def load_model(self, framework_model_dir, hf_token, subfolder="text_encoder"): - return self.from_pretrained(CLIPTextModel, framework_model_dir, hf_token, subfolder) + def load_model(self, framework_model_dir, subfolder="text_encoder"): + return self.from_pretrained(CLIPTextModel, framework_model_dir, subfolder) class CLIPWithProj(CLIP): @@ -682,8 +697,8 @@ def __init__( clip_skip=clip_skip, ) - def load_model(self, framework_model_dir, hf_token, subfolder="text_encoder_2"): - return self.from_pretrained(CLIPTextModelWithProjection, framework_model_dir, hf_token, subfolder) + def load_model(self, framework_model_dir, subfolder="text_encoder_2"): + return self.from_pretrained(CLIPTextModelWithProjection, framework_model_dir, subfolder) def get_shape_dict(self, batch_size, image_height, image_width): self.check_dims(batch_size, image_height, image_width) @@ -816,10 +831,10 @@ def __init__( self.unet_dim = unet_dim self.controlnet = pipeline_info.controlnet_name() - def load_model(self, framework_model_dir, hf_token, subfolder="unet"): + def load_model(self, framework_model_dir, subfolder="unet"): options = {"variant": "fp16", "torch_dtype": 
torch.float16} - model = self.from_pretrained(UNet2DConditionModel, framework_model_dir, hf_token, subfolder, **options) + model = self.from_pretrained(UNet2DConditionModel, framework_model_dir, subfolder, **options) if self.controlnet: controlnet_list = [] @@ -827,7 +842,6 @@ def load_model(self, framework_model_dir, hf_token, subfolder="unet"): controlnet = self.from_pretrained( ControlNetModel, framework_model_dir, - hf_token, subfolder=None, model_name=name, torch_dtype=torch.float16, @@ -929,10 +943,8 @@ def get_sample_input(self, batch_size, image_height, image_width): dtype = torch.float16 if self.fp16 else torch.float32 m = self.get_batch_multiplier() output = ( - torch.randn( - m * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device - ), - torch.tensor([1.0], dtype=torch.float32, device=self.device), + torch.randn(m * batch_size, self.unet_dim, latent_height, latent_width, dtype=dtype, device=self.device), + torch.tensor([1.0], dtype=dtype, device=self.device), torch.randn(m * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device), ) @@ -946,9 +958,6 @@ def get_sample_input(self, batch_size, image_height, image_width): ) return output - def fp32_input_output_names(self) -> List[str]: - return ["sample", "timestep"] - class UNetXL(BaseModel): def __init__( @@ -977,7 +986,7 @@ def __init__( self.custom_unet = pipeline_info.custom_unet() self.controlnet = pipeline_info.controlnet_name() - def load_model(self, framework_model_dir, hf_token, subfolder="unet", always_download_fp16=True): + def load_model(self, framework_model_dir, subfolder="unet", always_download_fp16=True): options = {"variant": "fp16", "torch_dtype": torch.float16} if self.fp16 or always_download_fp16 else {} if self.custom_unet: @@ -989,7 +998,7 @@ def load_model(self, framework_model_dir, hf_token, subfolder="unet", always_dow unet = UNet2DConditionModel.from_pretrained(model_dir, **options) model = unet.to(self.device) 
else: - model = self.from_pretrained(UNet2DConditionModel, framework_model_dir, hf_token, subfolder, **options) + model = self.from_pretrained(UNet2DConditionModel, framework_model_dir, subfolder, **options) if always_download_fp16 and not self.fp16: model = model.to(torch.float32) @@ -1107,9 +1116,9 @@ def get_sample_input(self, batch_size, image_height, image_width): if not self.controlnet: return ( torch.randn( - m * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device + m * batch_size, self.unet_dim, latent_height, latent_width, dtype=dtype, device=self.device ), - torch.tensor([1.0], dtype=torch.float32, device=self.device), + torch.tensor([1.0], dtype=dtype, device=self.device), torch.randn(m * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device), { "added_cond_kwargs": { @@ -1122,9 +1131,9 @@ def get_sample_input(self, batch_size, image_height, image_width): # sample, timestep, encoder_hidden_states, text_embeds, time_ids, controlnet_images, controlnet_scales, return ( torch.randn( - m * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device + m * batch_size, self.unet_dim, latent_height, latent_width, dtype=dtype, device=self.device ), - torch.tensor([1.0], dtype=torch.float32, device=self.device), + torch.tensor([1.0], dtype=dtype, device=self.device), torch.randn(m * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device), torch.randn(m * batch_size, 1280, dtype=dtype, device=self.device), torch.randn(m * batch_size, self.time_dim, dtype=dtype, device=self.device), @@ -1134,9 +1143,6 @@ def get_sample_input(self, batch_size, image_height, image_width): torch.randn(len(self.controlnet), dtype=dtype, device=self.device), ) - def fp32_input_output_names(self) -> List[str]: - return ["sample", "timestep"] - # VAE Decoder class VAE(BaseModel): @@ -1160,7 +1166,7 @@ def __init__( # For SD XL, need custom trained fp16 model 
to speed up, and avoid overflow at the same time. self.custom_fp16_vae = custom_fp16_vae - def load_model(self, framework_model_dir, hf_token: Optional[str] = None, subfolder: str = "vae_decoder"): + def load_model(self, framework_model_dir, subfolder: str = "vae_decoder"): model_name = self.custom_fp16_vae or self.pipeline_info.name() model_dir = os.path.join(framework_model_dir, model_name, subfolder) @@ -1172,7 +1178,6 @@ def load_model(self, framework_model_dir, hf_token: Optional[str] = None, subfol self.pipeline_info.name(), subfolder="vae", use_safetensors=self.pipeline_info.use_safetensors(), - use_auth_token=hf_token, ).to(self.device) vae.save_pretrained(model_dir) else: @@ -1225,13 +1230,14 @@ def get_shape_dict(self, batch_size, image_height, image_width): def get_sample_input(self, batch_size, image_height, image_width): latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) - return (torch.randn(batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device),) + dtype = torch.float16 if self.fp16 else torch.float32 + return (torch.randn(batch_size, 4, latent_height, latent_width, dtype=dtype, device=self.device),) def fp32_input_output_names(self) -> List[str]: - return [] if self.fp16 else ["latent", "images"] + return [] -def get_tokenizer(pipeline_info: PipelineInfo, framework_model_dir, hf_token, subfolder="tokenizer"): +def get_tokenizer(pipeline_info: PipelineInfo, framework_model_dir, subfolder="tokenizer"): tokenizer_dir = os.path.join(framework_model_dir, pipeline_info.name(), subfolder) if not os.path.exists(tokenizer_dir): @@ -1239,7 +1245,6 @@ def get_tokenizer(pipeline_info: PipelineInfo, framework_model_dir, hf_token, su pipeline_info.name(), subfolder=subfolder, use_safetensors=pipeline_info.is_xl(), - use_auth_token=hf_token, ) model.save_pretrained(tokenizer_dir) else: @@ -1266,8 +1271,8 @@ def __init__(self, pipeline_info: PipelineInfo, model, device, max_batch_size): 
max_batch_size=max_batch_size, ) - def load_model(self, framework_model_dir, hf_token, subfolder="vae_encoder"): - vae = self.from_pretrained(AutoencoderKL, framework_model_dir, hf_token, subfolder) + def load_model(self, framework_model_dir, subfolder="vae_encoder"): + vae = self.from_pretrained(AutoencoderKL, framework_model_dir, subfolder) return TorchVAEEncoder(vae) def get_input_names(self): diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py index ffa986f53304c..46a83f5dc228d 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py @@ -34,7 +34,6 @@ def __init__( pipeline_info: PipelineInfo, device="cuda", max_batch_size=16, - hf_token=None, use_cuda_graph=False, ): """ @@ -47,21 +46,18 @@ def __init__( device to run engine max_batch_size (int): Maximum batch size for dynamic batch engine. - hf_token (str): - HuggingFace User Access Token to use for downloading Stable Diffusion model checkpoints. 
use_cuda_graph (bool): Use CUDA graph to capture engine execution and then launch inference """ self.engine_type = engine_type self.pipeline_info = pipeline_info self.max_batch_size = max_batch_size - self.hf_token = hf_token self.use_cuda_graph = use_cuda_graph self.device = torch.device(device) self.torch_device = torch.device(device, torch.cuda.current_device()) self.stages = pipeline_info.stages() - self.vae_torch_fallback = self.pipeline_info.vae_torch_fallback() + self.vae_torch_fallback = self.pipeline_info.vae_torch_fallback() and self.engine_type != EngineType.TORCH self.custom_fp16_vae = self.pipeline_info.custom_fp16_vae() self.models = {} @@ -87,24 +83,36 @@ def teardown(self): del engine self.engines = {} + def get_diffusers_module_name(self, model_name): + name_mapping = { + "clip": "text_encoder", + "clip2": "text_encoder_2", + "unet": "unet", + "unetxl": "unet", + "vae": "vae_decoder", + } + return name_mapping[model_name] if model_name in name_mapping else model_name + def get_cached_model_name(self, model_name): + model_name = self.get_diffusers_module_name(model_name) + is_unet = model_name == "unet" hash_source = [] - if model_name in ["clip", "clip2", "unet", "unetxl"] and self.pipeline_info.lora_weights: + if model_name in ["text_encoder", "text_encoder_2", "unet"] and self.pipeline_info.lora_weights: if self.pipeline_info.lora_weights in [ "latent-consistency/lcm-lora-sdxl", "latent-consistency/lcm-lora-sdv1-5", ]: - if model_name in ["unet", "unetxl"]: - model_name = model_name + "_lcm-lora" + if is_unet: + model_name = "unet_lcm-lora" else: model_name = model_name + "_lora" hash_source.append(self.pipeline_info.lora_weights) # TODO(tianleiwu): save custom model to a directory named by its original model. 
- if model_name == "unetxl" and self.pipeline_info.custom_unet(): + if is_unet and self.pipeline_info.custom_unet(): model_name = model_name + "_lcm" - if model_name in ["unet", "unetxl"] and self.pipeline_info.controlnet: + if model_name in ["unet"] and self.pipeline_info.controlnet: model_name = model_name + "_" + "_".join(self.pipeline_info.controlnet) if hash_source: @@ -118,8 +126,9 @@ def get_cached_model_name(self, model_name): def get_model_dir(self, model_name, root_dir, opt=True, suffix="", create=True): engine_name = self.engine_type.name.lower() - # TODO: Need not add engine name for ORT_CUDA - directory_name = self.get_cached_model_name(model_name) + (f".{engine_name}" if opt else "") + suffix + if engine_name != "ort_cuda" and not suffix: + suffix = f".{engine_name}" if opt else "" + directory_name = self.get_cached_model_name(model_name) + suffix onnx_model_dir = os.path.join(root_dir, directory_name) if create: os.makedirs(onnx_model_dir, exist_ok=True) @@ -160,14 +169,14 @@ def get_or_load_model(self, pipeline, model_name, model_obj, framework_model_dir model = pipeline.unet pipeline.unet = None else: - model = model_obj.load_model(framework_model_dir, self.hf_token) + model = model_obj.load_model(framework_model_dir) return model.to(self.torch_device) def load_models(self, framework_model_dir: str): - # For TRT or ORT_TRT, we will export fp16 torch model for UNet. + # For TRT or ORT_TRT, we will export fp16 torch model for UNet and VAE # For ORT_CUDA, we export fp32 model first, then optimize to fp16. 
- export_fp16_unet = self.engine_type in [EngineType.ORT_TRT, EngineType.TRT] + export_fp16 = self.engine_type in [EngineType.ORT_TRT, EngineType.TRT] if "clip" in self.stages: self.models["clip"] = CLIP( @@ -192,7 +201,7 @@ def load_models(self, framework_model_dir: str): self.pipeline_info, None, # not loaded yet device=self.torch_device, - fp16=export_fp16_unet, + fp16=export_fp16, max_batch_size=self.max_batch_size, unet_dim=(9 if self.pipeline_info.is_inpaint() else 4), ) @@ -202,7 +211,7 @@ def load_models(self, framework_model_dir: str): self.pipeline_info, None, # not loaded yet device=self.torch_device, - fp16=export_fp16_unet, + fp16=export_fp16, max_batch_size=self.max_batch_size, unet_dim=4, time_dim=(5 if self.pipeline_info.is_xl_refiner() else 6), @@ -215,13 +224,17 @@ def load_models(self, framework_model_dir: str): None, # not loaded yet device=self.torch_device, max_batch_size=self.max_batch_size, + fp16=export_fp16, custom_fp16_vae=self.custom_fp16_vae, ) if self.vae_torch_fallback: - self.torch_models["vae"] = self.models["vae"].load_model(framework_model_dir, self.hf_token) + self.torch_models["vae"] = self.models["vae"].load_model(framework_model_dir) def load_resources(self, image_height, image_width, batch_size): + if self.engine_type == EngineType.TORCH: + return + # Allocate buffers for I/O bindings for model_name, obj in self.models.items(): if model_name == "vae" and self.vae_torch_fallback: @@ -232,13 +245,22 @@ def load_resources(self, image_height, image_width, batch_size): ) def _vae_decode(self, latents): - if self.vae_torch_fallback: + if self.engine_type == EngineType.TORCH: + if self.pipeline_info.is_xl() and not self.custom_fp16_vae: # need upcast + latents = latents.to(dtype=torch.float32) + images = self.engines["vae"](latents)["sample"] + else: + images = self.engines["vae"](latents)["sample"] + elif self.vae_torch_fallback: if not self.custom_fp16_vae: latents = latents.to(dtype=torch.float32) self.torch_models["vae"] = 
self.torch_models["vae"].to(dtype=torch.float32) images = self.torch_models["vae"](latents)["sample"] else: - images = self.run_engine("vae", {"latent": latents})["images"] + if self.pipeline_info.is_xl() and not self.custom_fp16_vae: # need upcast + images = self.run_engine("vae", {"latent": latents.to(dtype=torch.float32)})["images"] + else: + images = self.run_engine("vae", {"latent": latents})["images"] return images diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py index 2ac9a45577676..6ab4858f11f23 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py @@ -6,15 +6,17 @@ import gc import logging import os -import shutil from typing import List, Optional +import onnx import torch from diffusion_models import PipelineInfo from engine_builder import EngineBuilder, EngineType -from ort_utils import CudaSession +from packaging import version import onnxruntime as ort +from onnxruntime.transformers.io_binding_helper import CudaSession +from onnxruntime.transformers.onnx_model import OnnxModel logger = logging.getLogger(__name__) @@ -83,7 +85,6 @@ def __init__( self, pipeline_info: PipelineInfo, max_batch_size=16, - hf_token=None, device="cuda", use_cuda_graph=False, ): @@ -95,8 +96,6 @@ def __init__( Version and Type of pipeline. max_batch_size (int): Maximum batch size for dynamic batch engine. - hf_token (str): - HuggingFace User Access Token to use for downloading Stable Diffusion model checkpoints. device (str): device to run. 
use_cuda_graph (bool): @@ -106,7 +105,6 @@ def __init__( EngineType.ORT_CUDA, pipeline_info, max_batch_size=max_batch_size, - hf_token=hf_token, device=device, use_cuda_graph=use_cuda_graph, ) @@ -153,6 +151,65 @@ def configure_xl(self, onnx_opset_version: int): use_cuda_graph=self.use_cuda_graph, ) + def optimized_onnx_path(self, engine_dir, model_name): + suffix = "" if self.model_config[model_name].fp16 else ".fp32" + return self.get_onnx_path(model_name, engine_dir, opt=True, suffix=suffix) + + def import_diffusers_engine(self, diffusers_onnx_dir: str, engine_dir: str): + """Import optimized onnx models for diffusers from Olive or optimize_pipeline tools. + + Args: + diffusers_onnx_dir (str): optimized onnx directory of Olive + engine_dir (str): the directory to store imported onnx + """ + if version.parse(ort.__version__) < version.parse("1.17.0"): + print("Skip importing since onnxruntime-gpu version < 1.17.0.") + return + + for model_name, model_obj in self.models.items(): + onnx_import_path = self.optimized_onnx_path(diffusers_onnx_dir, model_name) + if not os.path.exists(onnx_import_path): + print(f"{onnx_import_path} not existed. Skip importing.") + continue + + onnx_opt_path = self.optimized_onnx_path(engine_dir, model_name) + if os.path.exists(onnx_opt_path): + print(f"{onnx_opt_path} existed. 
Skip importing.") + continue + + if model_name == "vae" and self.pipeline_info.is_xl(): + print(f"Skip importing VAE since it is not fully compatible with float16: {onnx_import_path}.") + continue + + model = OnnxModel(onnx.load(onnx_import_path, load_external_data=True)) + + if model_name in ["clip", "clip2"]: + hidden_states_per_layer = [] + for output in model.graph().output: + if output.name.startswith("hidden_states."): + hidden_states_per_layer.append(output.name) + if hidden_states_per_layer: + kept_hidden_states = hidden_states_per_layer[-2 - model_obj.clip_skip] + model.rename_graph_output(kept_hidden_states, "hidden_states") + + model.rename_graph_output( + "last_hidden_state" if model_name == "clip" else "text_embeds", "text_embeddings" + ) + model.prune_graph( + ["text_embeddings", "hidden_states"] if hidden_states_per_layer else ["text_embeddings"] + ) + + if model_name == "clip2": + model.change_graph_input_type(model.find_graph_input("input_ids"), onnx.TensorProto.INT32) + + model.save_model_to_file(onnx_opt_path, use_external_data_format=(model_name == "clip2")) + elif model_name in ["unet", "unetxl"]: + model.rename_graph_output("out_sample", "latent") + model.save_model_to_file(onnx_opt_path, use_external_data_format=True) + + del model + continue + def build_engines( self, engine_dir: str, @@ -160,21 +217,13 @@ def build_engines( onnx_dir: str, tmp_dir: Optional[str] = None, onnx_opset_version: int = 17, - force_engine_rebuild: bool = False, device_id: int = 0, - save_fp32_intermediate_model=False, + save_fp32_intermediate_model: bool = False, + import_engine_dir: Optional[str] = None, ): self.torch_device = torch.device("cuda", device_id) self.load_models(framework_model_dir) - if force_engine_rebuild: - if os.path.isdir(onnx_dir): - logger.info("Remove existing directory %s since force_engine_rebuild is enabled", onnx_dir) - shutil.rmtree(onnx_dir) - if os.path.isdir(engine_dir): - logger.info("Remove existing directory %s since 
force_engine_rebuild is enabled", engine_dir) - shutil.rmtree(engine_dir) - if not os.path.isdir(engine_dir): os.makedirs(engine_dir) @@ -188,6 +237,13 @@ def build_engines( if model_name not in self.model_config: self.model_config[model_name] = _ModelConfig(onnx_opset_version, self.use_cuda_graph) + # Import Engine + if import_engine_dir: + if self.pipeline_info.is_xl(): + self.import_diffusers_engine(import_engine_dir, engine_dir) + else: + print(f"Only support importing SDXL onnx. Ignore --engine-dir {import_engine_dir}") + # Load lora only when we need export text encoder or UNet to ONNX. load_lora = False if self.pipeline_info.lora_weights: @@ -195,9 +251,7 @@ def build_engines( if model_name not in ["clip", "clip2", "unet", "unetxl"]: continue onnx_path = self.get_onnx_path(model_name, onnx_dir, opt=False) - - suffix = ".fp16" if self.model_config[model_name].fp16 else ".fp32" - onnx_opt_path = self.get_onnx_path(model_name, engine_dir, opt=True, suffix=suffix) + onnx_opt_path = self.optimized_onnx_path(engine_dir, model_name) if not os.path.exists(onnx_opt_path): if not os.path.exists(onnx_path): load_lora = True @@ -212,8 +266,7 @@ def build_engines( continue onnx_path = self.get_onnx_path(model_name, onnx_dir, opt=False) - suffix = ".fp16" if self.model_config[model_name].fp16 else ".fp32" - onnx_opt_path = self.get_onnx_path(model_name, engine_dir, opt=True, suffix=suffix) + onnx_opt_path = self.optimized_onnx_path(engine_dir, model_name) if not os.path.exists(onnx_opt_path): if not os.path.exists(onnx_path): print("----") @@ -280,7 +333,7 @@ def build_engines( fp32_op_list=self.model_config[model_name].force_fp32_ops, optimize_by_ort=optimize_by_ort, optimize_by_fusion=not use_fp32_intermediate, - tmp_dir=self.get_model_dir(model_name, tmp_dir, opt=False, suffix=".fp16", create=False), + tmp_dir=self.get_model_dir(model_name, tmp_dir, opt=False, suffix=".ort", create=False), ) else: logger.info("Found cached optimized model: %s", onnx_opt_path) @@ -291,9 
+344,7 @@ def build_engines( if model_name == "vae" and self.vae_torch_fallback: continue - suffix = ".fp16" if self.model_config[model_name].fp16 else ".fp32" - onnx_opt_path = self.get_onnx_path(model_name, engine_dir, opt=True, suffix=suffix) - + onnx_opt_path = self.optimized_onnx_path(engine_dir, model_name) use_cuda_graph = self.model_config[model_name].use_cuda_graph engine = OrtCudaEngine( @@ -308,7 +359,5 @@ def build_engines( self.engines = built_engines - return built_engines - def run_engine(self, model_name, feed_dict): return self.engines[model_name].infer(feed_dict) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_trt.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_trt.py index 8c637007b840d..a606b88c82245 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_trt.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_trt.py @@ -6,16 +6,15 @@ import gc import logging import os -import shutil import torch from cuda import cudart from diffusion_models import PipelineInfo from engine_builder import EngineBuilder, EngineType -from ort_utils import CudaSession from packaging import version import onnxruntime as ort +from onnxruntime.transformers.io_binding_helper import CudaSession logger = logging.getLogger(__name__) @@ -110,7 +109,6 @@ def __init__( self, pipeline_info: PipelineInfo, max_batch_size=16, - hf_token=None, device="cuda", use_cuda_graph=False, ): @@ -122,8 +120,6 @@ def __init__( Version and Type of pipeline. max_batch_size (int): Maximum batch size for dynamic batch engine. - hf_token (str): - HuggingFace User Access Token to use for downloading Stable Diffusion model checkpoints. device (str): device to run. 
use_cuda_graph (bool): @@ -133,7 +129,6 @@ def __init__( EngineType.ORT_TRT, pipeline_info, max_batch_size=max_batch_size, - hf_token=hf_token, device=device, use_cuda_graph=use_cuda_graph, ) @@ -165,7 +160,6 @@ def build_engines( opt_image_height, opt_image_width, opt_batch_size=1, - force_engine_rebuild=False, static_batch=False, static_image_shape=True, max_workspace_size=0, @@ -175,14 +169,6 @@ def build_engines( self.torch_device = torch.device("cuda", device_id) self.load_models(framework_model_dir) - if force_engine_rebuild: - if os.path.isdir(onnx_dir): - logger.info("Remove existing directory %s since force_engine_rebuild is enabled", onnx_dir) - shutil.rmtree(onnx_dir) - if os.path.isdir(engine_dir): - logger.info("Remove existing directory %s since force_engine_rebuild is enabled", engine_dir) - shutil.rmtree(engine_dir) - if not os.path.isdir(engine_dir): os.makedirs(engine_dir) @@ -298,7 +284,5 @@ def build_engines( self.engines = built_engines - return built_engines - def run_engine(self, model_name, feed_dict): return self.engines[model_name].infer(feed_dict) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_tensorrt.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_tensorrt.py index bac1a8bb8140d..438145fc2c57a 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_tensorrt.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_tensorrt.py @@ -26,8 +26,6 @@ from collections import OrderedDict import numpy as np -import onnx -import onnx_graphsurgeon as gs import tensorrt as trt import torch from cuda import cudart @@ -43,7 +41,6 @@ network_from_onnx_path, save_engine, ) -from trt_utilities import TRT_LOGGER # Map of numpy dtype -> torch dtype numpy_to_torch_dtype_dict = { @@ -83,115 +80,11 @@ def __del__(self): del self.buffers del self.tensors - def refit(self, onnx_path, onnx_refit_path): - def 
convert_int64(arr): - if len(arr.shape) == 0: - return np.int32(arr) - return arr - - def add_to_map(refit_dict, name, values): - if name in refit_dict: - assert refit_dict[name] is None - if values.dtype == np.int64: - values = convert_int64(values) - refit_dict[name] = values - - print(f"Refitting TensorRT engine with {onnx_refit_path} weights") - refit_nodes = gs.import_onnx(onnx.load(onnx_refit_path)).toposort().nodes - - # Construct mapping from weight names in refit model -> original model - name_map = {} - for n, node in enumerate(gs.import_onnx(onnx.load(onnx_path)).toposort().nodes): - refit_node = refit_nodes[n] - assert node.op == refit_node.op - # Constant nodes in ONNX do not have inputs but have a constant output - if node.op == "Constant": - name_map[refit_node.outputs[0].name] = node.outputs[0].name - # Handle scale and bias weights - elif node.op == "Conv": - if node.inputs[1].__class__ == gs.Constant: - name_map[refit_node.name + "_TRTKERNEL"] = node.name + "_TRTKERNEL" - if node.inputs[2].__class__ == gs.Constant: - name_map[refit_node.name + "_TRTBIAS"] = node.name + "_TRTBIAS" - # For all other nodes: find node inputs that are initializers (gs.Constant) - else: - for i, inp in enumerate(node.inputs): - if inp.__class__ == gs.Constant: - name_map[refit_node.inputs[i].name] = inp.name - - def map_name(name): - if name in name_map: - return name_map[name] - return name - - # Construct refit dictionary - refit_dict = {} - refitter = trt.Refitter(self.engine, TRT_LOGGER) - all_weights = refitter.get_all() - for layer_name, role in zip(all_weights[0], all_weights[1]): - # for specialized roles, use a unique name in the map: - if role == trt.WeightsRole.KERNEL: - name = layer_name + "_TRTKERNEL" - elif role == trt.WeightsRole.BIAS: - name = layer_name + "_TRTBIAS" - else: - name = layer_name - - assert name not in refit_dict, "Found duplicate layer: " + name - refit_dict[name] = None - - for n in refit_nodes: - # Constant nodes in ONNX do not have 
inputs but have a constant output - if n.op == "Constant": - name = map_name(n.outputs[0].name) - print(f"Add Constant {name}\n") - add_to_map(refit_dict, name, n.outputs[0].values) - - # Handle scale and bias weights - elif n.op == "Conv": - if n.inputs[1].__class__ == gs.Constant: - name = map_name(n.name + "_TRTKERNEL") - add_to_map(refit_dict, name, n.inputs[1].values) - - if n.inputs[2].__class__ == gs.Constant: - name = map_name(n.name + "_TRTBIAS") - add_to_map(refit_dict, name, n.inputs[2].values) - - # For all other nodes: find node inputs that are initializers (AKA gs.Constant) - else: - for inp in n.inputs: - name = map_name(inp.name) - if inp.__class__ == gs.Constant: - add_to_map(refit_dict, name, inp.values) - - for layer_name, weights_role in zip(all_weights[0], all_weights[1]): - if weights_role == trt.WeightsRole.KERNEL: - custom_name = layer_name + "_TRTKERNEL" - elif weights_role == trt.WeightsRole.BIAS: - custom_name = layer_name + "_TRTBIAS" - else: - custom_name = layer_name - - # Skip refitting Trilu for now; scalar weights of type int64 value 1 - for clip model - if layer_name.startswith("onnx::Trilu"): - continue - - if refit_dict[custom_name] is not None: - refitter.set_weights(layer_name, weights_role, refit_dict[custom_name]) - else: - print(f"[W] No refit weights for layer: {layer_name}") - - if not refitter.refit_cuda_engine(): - print("Failed to refit!") - exit(0) - def build( self, onnx_path, fp16, input_profile=None, - enable_refit=False, - enable_preview=False, enable_all_tactics=False, timing_cache=None, update_output_names=None, @@ -214,7 +107,7 @@ def build( engine = engine_from_network( network, config=CreateConfig( - fp16=fp16, refittable=enable_refit, profiles=[p], load_timing_cache=timing_cache, **config_kwargs + fp16=fp16, refittable=False, profiles=[p], load_timing_cache=timing_cache, **config_kwargs ), save_timing_cache=timing_cache, ) @@ -294,7 +187,6 @@ def __init__( self, pipeline_info: PipelineInfo, max_batch_size=16, 
- hf_token=None, device="cuda", use_cuda_graph=False, ): @@ -306,8 +198,6 @@ def __init__( Version and Type of pipeline. max_batch_size (int): Maximum batch size for dynamic batch engine. - hf_token (str): - HuggingFace User Access Token to use for downloading Stable Diffusion model checkpoints. device (str): device to run. use_cuda_graph (bool): @@ -317,7 +207,6 @@ def __init__( EngineType.TRT, pipeline_info, max_batch_size=max_batch_size, - hf_token=hf_token, device=device, use_cuda_graph=use_cuda_graph, ) @@ -348,16 +237,10 @@ def load_engines( opt_batch_size, opt_image_height, opt_image_width, - force_export=False, - force_optimize=False, - force_build=False, static_batch=False, static_shape=True, - enable_refit=False, - enable_preview=False, enable_all_tactics=False, timing_cache=None, - onnx_refit_dir=None, ): """ Build and load engines for TensorRT accelerated inference. @@ -378,26 +261,14 @@ def load_engines( Image height to optimize for during engine building. Must be a multiple of 8. opt_image_width (int): Image width to optimize for during engine building. Must be a multiple of 8. - force_export (bool): - Force re-exporting the ONNX models. - force_optimize (bool): - Force re-optimizing the ONNX models. - force_build (bool): - Force re-building the TensorRT engine. static_batch (bool): Build engine only for specified opt_batch_size. static_shape (bool): Build engine only for specified opt_image_height & opt_image_width. Default = True. - enable_refit (bool): - Build engines with refit option enabled. - enable_preview (bool): - Enable TensorRT preview features. enable_all_tactics (bool): Enable all tactic sources during TensorRT engine builds. timing_cache (str): Path to the timing cache to accelerate build or None - onnx_refit_dir (str): - Directory containing refit ONNX models. 
""" # Create directory for directory in [engine_dir, onnx_dir]: @@ -417,11 +288,11 @@ def load_engines( opt_batch_size, opt_image_height, opt_image_width, static_batch, static_shape ) engine_path = self.get_engine_path(engine_dir, model_name, profile_id) - if force_export or force_build or not os.path.exists(engine_path): + if not os.path.exists(engine_path): onnx_path = self.get_onnx_path(model_name, onnx_dir, opt=False) onnx_opt_path = self.get_onnx_path(model_name, onnx_dir, opt=True) - if force_export or not os.path.exists(onnx_opt_path): - if force_export or not os.path.exists(onnx_path): + if not os.path.exists(onnx_opt_path): + if not os.path.exists(onnx_path): load_lora = True break @@ -436,11 +307,11 @@ def load_engines( opt_batch_size, opt_image_height, opt_image_width, static_batch, static_shape ) engine_path = self.get_engine_path(engine_dir, model_name, profile_id) - if force_export or force_build or not os.path.exists(engine_path): + if not os.path.exists(engine_path): onnx_path = self.get_onnx_path(model_name, onnx_dir, opt=False) onnx_opt_path = self.get_onnx_path(model_name, onnx_dir, opt=True) - if force_export or not os.path.exists(onnx_opt_path): - if force_export or not os.path.exists(onnx_path): + if not os.path.exists(onnx_opt_path): + if not os.path.exists(onnx_path): print(f"Exporting model: {onnx_path}") model = self.get_or_load_model(pipe, model_name, model_obj, framework_model_dir) @@ -464,7 +335,7 @@ def load_engines( print(f"Found cached model: {onnx_path}") # Optimize onnx - if force_optimize or not os.path.exists(onnx_opt_path): + if not os.path.exists(onnx_opt_path): print(f"Generating optimizing model: {onnx_opt_path}") model_obj.optimize_trt(onnx_path, onnx_opt_path) else: @@ -482,7 +353,7 @@ def load_engines( engine = TensorrtEngine(engine_path) onnx_opt_path = self.get_onnx_path(model_name, onnx_dir, opt=True) - if force_build or not os.path.exists(engine.engine_path): + if not os.path.exists(engine.engine_path): engine.build( 
onnx_opt_path, fp16=True, @@ -493,8 +364,6 @@ def load_engines( static_batch, static_shape, ), - enable_refit=enable_refit, - enable_preview=enable_preview, enable_all_tactics=enable_all_tactics, timing_cache=timing_cache, update_output_names=None, @@ -506,10 +375,6 @@ def load_engines( if model_name == "vae" and self.vae_torch_fallback: continue self.engines[model_name].load() - if onnx_refit_dir: - onnx_refit_path = self.get_onnx_path(model_name, onnx_refit_dir, opt=True) - if os.path.exists(onnx_refit_path): - self.engines[model_name].refit(onnx_opt_path, onnx_refit_path) def max_device_memory(self): max_device_memory = 0 diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_torch.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_torch.py new file mode 100644 index 0000000000000..84b9ec27b801e --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_torch.py @@ -0,0 +1,108 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import logging + +from diffusion_models import PipelineInfo +from engine_builder import EngineBuilder, EngineType + +logger = logging.getLogger(__name__) + + +class TorchEngineBuilder(EngineBuilder): + def __init__( + self, + pipeline_info: PipelineInfo, + max_batch_size=16, + device="cuda", + use_cuda_graph=False, + ): + """ + Initializes the ONNX Runtime TensorRT ExecutionProvider Engine Builder. + + Args: + pipeline_info (PipelineInfo): + Version and Type of pipeline. + max_batch_size (int): + Maximum batch size for dynamic batch engine. + device (str): + device to run. 
+ use_cuda_graph (bool): + Use CUDA graph to capture engine execution and then launch inference + """ + super().__init__( + EngineType.TORCH, + pipeline_info, + max_batch_size=max_batch_size, + device=device, + use_cuda_graph=use_cuda_graph, + ) + + self.compile_config = {} + if use_cuda_graph: + self.compile_config = { + "clip": {"mode": "reduce-overhead", "dynamic": False}, + "clip2": {"mode": "reduce-overhead", "dynamic": False}, + "unet": {"mode": "reduce-overhead", "fullgraph": True, "dynamic": False}, + "unetxl": {"mode": "reduce-overhead", "fullgraph": True, "dynamic": False}, + "vae": {"mode": "reduce-overhead", "fullgraph": False, "dynamic": False}, + } + + def build_engines( + self, + framework_model_dir: str, + ): + import torch + + self.torch_device = torch.device("cuda", torch.cuda.current_device()) + self.load_models(framework_model_dir) + + pipe = self.load_pipeline_with_lora() if self.pipeline_info.lora_weights else None + + built_engines = {} + for model_name, model_obj in self.models.items(): + model = self.get_or_load_model(pipe, model_name, model_obj, framework_model_dir) + if self.pipeline_info.is_xl() and not self.custom_fp16_vae: + model = model.to(device=self.torch_device, dtype=torch.float32) + else: + model = model.to(device=self.torch_device, dtype=torch.float16) + + if model_name in self.compile_config: + compile_config = self.compile_config[model_name] + if model_name in ["unet", "unetxl"]: + model.to(memory_format=torch.channels_last) + engine = torch.compile(model, **compile_config) + built_engines[model_name] = engine + else: # eager mode + built_engines[model_name] = model + + self.engines = built_engines + + def run_engine(self, model_name, feed_dict): + if model_name in ["unet", "unetxl"]: + if "controlnet_images" in feed_dict: + return {"latent": self.engines[model_name](**feed_dict)} + + if model_name == "unetxl": + added_cond_kwargs = {k: feed_dict[k] for k in feed_dict if k in ["text_embeds", "time_ids"]} + return { + 
"latent": self.engines[model_name]( + feed_dict["sample"], + feed_dict["timestep"], + feed_dict["encoder_hidden_states"], + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + } + + return { + "latent": self.engines[model_name]( + feed_dict["sample"], feed_dict["timestep"], feed_dict["encoder_hidden_states"], return_dict=False + )[0] + } + + if model_name in ["vae_encoder"]: + return {"latent": self.engines[model_name](feed_dict["images"])} + + raise RuntimeError(f"Shall not reach here: {model_name}") diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py deleted file mode 100644 index 37785869a355b..0000000000000 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py +++ /dev/null @@ -1,292 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -# -# Copyright 2023 The HuggingFace Inc. team. -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Stable diffusion text to image pipeline using ONNX Runtime CUDA execution provider. 
-Based on https://github.com/huggingface/diffusers/blob/v0.17.1/examples/community/stable_diffusion_tensorrt_txt2img.py -Modifications: (1) Create ONNX Runtime session (2) Use I/O Binding of ONNX Runtime for inference - -Installation instructions -pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 -pip install --upgrade transformers diffusers>=0.16.0 -pip install numpy>=1.24.1 onnx>=1.13.0 coloredlogs protobuf==3.20.3 psutil sympy -pip install onnxruntime-gpu -""" - -import logging -import os -from typing import List, Optional, Union - -import torch -from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines.stable_diffusion import ( - StableDiffusionPipeline, - StableDiffusionPipelineOutput, - StableDiffusionSafetyChecker, -) -from diffusers.schedulers import DDIMScheduler -from diffusion_models import CLIP, VAE, PipelineInfo, UNet -from ort_utils import Engines, StableDiffusionPipelineMixin -from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer - -logger = logging.getLogger(__name__) - - -class OnnxruntimeCudaStableDiffusionPipeline(StableDiffusionPipelineMixin, StableDiffusionPipeline): - r""" - Pipeline for text-to-image generation using CUDA provider in ONNX Runtime. - This pipeline inherits from [`StableDiffusionPipeline`]. Check the documentation in super class for most parameters. 
- """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: DDIMScheduler, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool = True, - # ONNX export parameters - onnx_opset: int = 14, - onnx_dir: str = "onnx_ort", - # Onnxruntime execution provider parameters - engine_dir: str = "ORT_CUDA", - force_engine_rebuild: bool = False, - enable_cuda_graph: bool = False, - pipeline_info: PipelineInfo = None, - ): - super().__init__( - vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker - ) - - self.vae.forward = self.vae.decode - self.unet_in_channels = unet.config.in_channels - - self.inpaint = False - self.onnx_dir = onnx_dir - self.engine_dir = engine_dir - self.force_engine_rebuild = force_engine_rebuild - self.enable_cuda_graph = enable_cuda_graph - - self.max_batch_size = 16 - - self.models = {} # loaded in __load_models() - self.engines = Engines("CUDAExecutionProvider", onnx_opset) - - self.fp16 = False - - self.pipeline_info = pipeline_info - - def load_models(self): - assert self.pipeline_info.clip_embedding_dim() == self.text_encoder.config.hidden_size - - stages = self.pipeline_info.stages() - if "clip" in stages: - self.models["clip"] = CLIP( - self.pipeline_info, - self.text_encoder, - device=self.torch_device, - max_batch_size=self.max_batch_size, - clip_skip=0, - ) - - if "unet" in stages: - self.models["unet"] = UNet( - self.pipeline_info, - self.unet, - device=self.torch_device, - fp16=False, - max_batch_size=self.max_batch_size, - unet_dim=(9 if self.pipeline_info.is_inpaint() else 4), - ) - - if "vae" in stages: - self.models["vae"] = VAE( - self.pipeline_info, - self.vae, - device=self.torch_device, - max_batch_size=self.max_batch_size, - ) - - def to( - self, - torch_device: Union[str, torch.device], - torch_dtype: 
Optional[torch.dtype] = None, - silence_dtype_warnings: bool = False, - ): - self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir) - self.engine_dir = os.path.join(self.cached_folder, self.engine_dir) - - # set device - self.torch_device = torch.device(torch_device) - - # load models - self.fp16 = torch_dtype == torch.float16 - self.load_models() - - # build engines - self.engines.build( - self.models, - self.engine_dir, - self.onnx_dir, - force_engine_rebuild=self.force_engine_rebuild, - fp16=self.fp16, - device_id=self.torch_device.index or torch.cuda.current_device(), - enable_cuda_graph=self.enable_cuda_graph, - ) - - # Load the remaining modules to GPU. - self.text_encoder = None - self.vae = None - self.unet = None - super().to(torch_device, torch_dtype, silence_dtype_warnings=silence_dtype_warnings) - - self.torch_device = self._execution_device - logger.info(f"Running inference on device: {self.torch_device}") - - return self - - def __allocate_buffers(self, image_height, image_width, batch_size): - # Allocate output tensors for I/O bindings - for model_name, obj in self.models.items(): - self.engines.get_engine(model_name).allocate_buffers( - obj.get_shape_dict(batch_size, image_height, image_width) - ) - - @torch.no_grad() - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - image_height: int = 512, - image_width: int = 512, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. 
- """ - self.generator = generator - self.denoising_steps = num_inference_steps - self.guidance_scale = guidance_scale - - # Pre-compute latent input scales and linear multistep coefficients - self.scheduler.set_timesteps(self.denoising_steps, device=self.torch_device) - - # Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - prompt = [prompt] - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - raise ValueError(f"Expected prompt to be of type list or str but got {type(prompt)}") - - if negative_prompt is None: - negative_prompt = [""] * batch_size - - if negative_prompt is not None and isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] - - assert len(prompt) == len(negative_prompt) - - if batch_size > self.max_batch_size: - raise ValueError( - f"Batch size {len(prompt)} is larger than allowed {self.max_batch_size}. If dynamic shape is used, then maximum batch size is 4" - ) - - self.__allocate_buffers(image_height, image_width, batch_size) - - with torch.inference_mode(), torch.autocast("cuda"): - # CLIP text encoder - text_embeddings = self.encode_prompt(self.engines.get_engine("clip"), prompt, negative_prompt) - - # Pre-initialize latents - num_channels_latents = self.unet_in_channels - latents = self.prepare_latents( - batch_size, - num_channels_latents, - image_height, - image_width, - torch.float16 if self.fp16 else torch.float32, - self.torch_device, - generator, - ) - - # UNet denoiser - latents = self.denoise_latent( - self.engines.get_engine("unet"), latents, text_embeddings, timestep_fp16=self.fp16 - ) - - # VAE decode latent - images = self.decode_latent(self.engines.get_engine("vae"), latents) - - images, has_nsfw_concept = self.run_safety_checker(images, self.torch_device, text_embeddings.dtype) - images = self.numpy_to_pil(images) - return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept) - - -def example(): - 
pipeline_info = PipelineInfo("1.5") - model_name_or_path = pipeline_info.name() - scheduler = DDIMScheduler.from_pretrained(model_name_or_path, subfolder="scheduler") - pipe = OnnxruntimeCudaStableDiffusionPipeline.from_pretrained( - model_name_or_path, - scheduler=scheduler, - pipeline_info=pipeline_info, - ) - - # re-use cached folder to save ONNX models - pipe.set_cached_folder(model_name_or_path, resume_download=True, local_files_only=True) - - pipe = pipe.to("cuda", torch_dtype=torch.float16) - - prompt = "photorealistic new zealand hills" - image = pipe(prompt).images[0] - image.save("ort_cuda_txt2img_new_zealand_hills.png") - - -if __name__ == "__main__": - example() diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py deleted file mode 100644 index c663e37c7ea7d..0000000000000 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py +++ /dev/null @@ -1,261 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -# -# Copyright 2023 The HuggingFace Inc. team. -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Stable diffusion text to image pipeline using ONNX Runtime TensorRT execution provider. -Based on https://github.com/huggingface/diffusers/blob/v0.17.1/examples/community/stable_diffusion_tensorrt_txt2img.py -Modifications: (1) Create ONNX Runtime session (2) Use I/O Binding of ONNX Runtime for inference - -Installation instructions -pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 -pip install --upgrade transformers diffusers>=0.16.0 -pip install --upgrade tensorrt>=8.6.1 -pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com -pip install onnxruntime-gpu -""" - -import logging -import os -from typing import List, Optional, Union - -import torch -from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines.stable_diffusion import ( - StableDiffusionPipeline, - StableDiffusionPipelineOutput, - StableDiffusionSafetyChecker, -) -from diffusers.schedulers import DDIMScheduler -from diffusion_models import PipelineInfo -from engine_builder_ort_trt import OrtTensorrtEngineBuilder -from ort_utils import StableDiffusionPipelineMixin -from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer - -logger = logging.getLogger(__name__) - - -class OnnxruntimeTensorRTStableDiffusionPipeline(StableDiffusionPipelineMixin, StableDiffusionPipeline): - r""" - Pipeline for text-to-image generation using TensorRT execution provider in ONNX Runtime. - - This pipeline inherits from [`StableDiffusionPipeline`]. Check the documentation in super class for most parameters. 
- """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: DDIMScheduler, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool = True, - image_height: int = 768, - image_width: int = 768, - max_batch_size: int = 16, - # ONNX export parameters - onnx_opset: int = 17, - onnx_dir: str = "onnx_trt", - # TensorRT engine build parameters - engine_dir: str = "ORT_TRT", # use short name here to avoid path exceeds 260 chars in Windows. - force_engine_rebuild: bool = False, - enable_cuda_graph: bool = False, - pipeline_info: Optional[PipelineInfo] = None, - ): - super().__init__( - vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker - ) - - self.vae.forward = self.vae.decode - - self.image_height = image_height - self.image_width = image_width - self.onnx_opset = onnx_opset - self.onnx_dir = onnx_dir - self.engine_dir = engine_dir - self.force_engine_rebuild = force_engine_rebuild - - # Although cuda graph requires static input shape, engine built with dynamic batch gets better performance in T4. - # Use static batch could reduce GPU memory footprint. - self.build_static_batch = enable_cuda_graph - - # TODO: support dynamic image shape. - self.build_dynamic_shape = False - - self.max_batch_size = max_batch_size - # Restrict batch size to 4 for larger image dimensions as a walkaround for TensorRT limitation. 
- if self.build_dynamic_shape or self.image_height > 512 or self.image_width > 512: - self.max_batch_size = 4 - - self.engines = {} # loaded in build_engines() - self.engine_builder = OrtTensorrtEngineBuilder( - pipeline_info, max_batch_size=max_batch_size, use_cuda_graph=enable_cuda_graph - ) - - self.pipeline_info = pipeline_info - self.stages = pipeline_info.stages() - - def to( - self, - torch_device: Optional[Union[str, torch.device]] = None, - silence_dtype_warnings: bool = False, - ): - super().to(torch_device, silence_dtype_warnings=silence_dtype_warnings) - - self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir) - self.engine_dir = os.path.join(self.cached_folder, self.engine_dir) - - # set device - self.torch_device = self._execution_device - logger.info(f"Running inference on device: {self.torch_device}") - - self.engines = self.engine_builder.build_engines( - self.engine_dir, - None, - self.onnx_dir, - self.onnx_opset, - opt_image_height=self.image_height, - opt_image_width=self.image_width, - force_engine_rebuild=self.force_engine_rebuild, - static_batch=self.build_static_batch, - static_image_shape=not self.build_dynamic_shape, - device_id=self.torch_device.index, - ) - - return self - - @torch.no_grad() - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. 
- guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - - """ - self.generator = generator - self.denoising_steps = num_inference_steps - self.guidance_scale = guidance_scale - - # Pre-compute latent input scales and linear multistep coefficients - self.scheduler.set_timesteps(self.denoising_steps, device=self.torch_device) - - # Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - prompt = [prompt] - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - raise ValueError(f"Expected prompt to be of type list or str but got {type(prompt)}") - - if negative_prompt is None: - negative_prompt = [""] * batch_size - - if negative_prompt is not None and isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] - - assert len(prompt) == len(negative_prompt) - - if batch_size > self.max_batch_size: - raise ValueError( - f"Batch size {len(prompt)} is larger than allowed {self.max_batch_size}. 
If dynamic shape is used, then maximum batch size is 4" - ) - - self.engine_builder.load_resources(self.image_height, self.image_width, batch_size) - - with torch.inference_mode(), torch.autocast("cuda"): - # CLIP text encoder - text_embeddings = self.encode_prompt(self.engines["clip"], prompt, negative_prompt) - - # Pre-initialize latents - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size, - num_channels_latents, - self.image_height, - self.image_width, - torch.float32, - self.torch_device, - generator, - ) - - # UNet denoiser - latents = self.denoise_latent(self.engines["unet"], latents, text_embeddings) - - # VAE decode latent - images = self.decode_latent(self.engines["vae"], latents) - - images, has_nsfw_concept = self.run_safety_checker(images, self.torch_device, text_embeddings.dtype) - images = self.numpy_to_pil(images) - return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept) - - -if __name__ == "__main__": - pipeline_info = PipelineInfo("1.5") - model_name_or_path = pipeline_info.name() - scheduler = DDIMScheduler.from_pretrained(model_name_or_path, subfolder="scheduler") - - pipe = OnnxruntimeTensorRTStableDiffusionPipeline.from_pretrained( - model_name_or_path, - revision="fp16", - torch_dtype=torch.float16, - scheduler=scheduler, - image_height=512, - image_width=512, - max_batch_size=4, - pipeline_info=pipeline_info, - ) - - # re-use cached folder to save ONNX models and TensorRT Engines - pipe.set_cached_folder(model_name_or_path, revision="fp16") - - pipe = pipe.to("cuda") - - prompt = "photorealistic new zealand hills" - image = pipe(prompt).images[0] - image.save("ort_trt_txt2img_new_zealand_hills.png") diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/ort_utils.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/ort_utils.py deleted file mode 100644 index 0afa13a0f4dca..0000000000000 --- 
a/onnxruntime/python/tools/transformers/models/stable_diffusion/ort_utils.py +++ /dev/null @@ -1,262 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -import gc -import logging -import os -import shutil -import sys -from typing import Union - -import torch - -import onnxruntime as ort - -logger = logging.getLogger(__name__) - - -def add_transformers_dir_to_path(): - sys.path.append(os.path.dirname(__file__)) - - transformers_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..")) - if transformers_dir not in sys.path: - sys.path.append(transformers_dir) - - -add_transformers_dir_to_path() -from io_binding_helper import CudaSession # noqa: E402. Walk-around to test locally - - -# ----------------------------------------------------------------------------------------------------- -# Utilities for CUDA EP -# ----------------------------------------------------------------------------------------------------- -class Engine(CudaSession): - def __init__(self, engine_path, provider: str, device_id: int = 0, enable_cuda_graph=False): - self.engine_path = engine_path - self.provider = provider - self.provider_options = CudaSession.get_cuda_provider_options(device_id, enable_cuda_graph) - - device = torch.device("cuda", device_id) - ort_session = ort.InferenceSession( - self.engine_path, - providers=[ - (provider, self.provider_options), - "CPUExecutionProvider", - ], - ) - - super().__init__(ort_session, device, enable_cuda_graph) - - -class Engines: - def __init__(self, provider, onnx_opset: int = 14): - self.provider = provider - self.engines = {} - self.onnx_opset = onnx_opset - - @staticmethod - def get_onnx_path(onnx_dir, model_name): - return os.path.join(onnx_dir, model_name + ".onnx") - - @staticmethod - def get_engine_path(engine_dir, 
model_name, profile_id): - return os.path.join(engine_dir, model_name + profile_id + ".onnx") - - def build( - self, - models, - engine_dir: str, - onnx_dir: str, - force_engine_rebuild: bool = False, - fp16: bool = True, - device_id: int = 0, - enable_cuda_graph: bool = False, - ): - profile_id = "_fp16" if fp16 else "_fp32" - - if force_engine_rebuild: - if os.path.isdir(onnx_dir): - logger.info("Remove existing directory %s since force_engine_rebuild is enabled", onnx_dir) - shutil.rmtree(onnx_dir) - if os.path.isdir(engine_dir): - logger.info("Remove existing directory %s since force_engine_rebuild is enabled", engine_dir) - shutil.rmtree(engine_dir) - - if not os.path.isdir(engine_dir): - os.makedirs(engine_dir) - - if not os.path.isdir(onnx_dir): - os.makedirs(onnx_dir) - - # Export models to ONNX - for model_name, model_obj in models.items(): - onnx_path = Engines.get_onnx_path(onnx_dir, model_name) - onnx_opt_path = Engines.get_engine_path(engine_dir, model_name, profile_id) - if os.path.exists(onnx_opt_path): - logger.info("Found cached optimized model: %s", onnx_opt_path) - else: - if os.path.exists(onnx_path): - logger.info("Found cached model: %s", onnx_path) - else: - logger.info("Exporting model: %s", onnx_path) - model = model_obj.get_model().to(model_obj.device) - with torch.inference_mode(): - inputs = model_obj.get_sample_input(1, 512, 512) - fp32_inputs = tuple( - [ - (tensor.to(torch.float32) if tensor.dtype == torch.float16 else tensor) - for tensor in inputs - ] - ) - - torch.onnx.export( - model, - fp32_inputs, - onnx_path, - export_params=True, - opset_version=self.onnx_opset, - do_constant_folding=True, - input_names=model_obj.get_input_names(), - output_names=model_obj.get_output_names(), - dynamic_axes=model_obj.get_dynamic_axes(), - ) - del model - torch.cuda.empty_cache() - gc.collect() - - # Optimize onnx - logger.info("Generating optimized model: %s", onnx_opt_path) - model_obj.optimize_ort(onnx_path, onnx_opt_path, to_fp16=fp16) - - 
for model_name in models: - engine_path = Engines.get_engine_path(engine_dir, model_name, profile_id) - engine = Engine(engine_path, self.provider, device_id=device_id, enable_cuda_graph=enable_cuda_graph) - logger.info("%s options for %s: %s", self.provider, model_name, engine.provider_options) - self.engines[model_name] = engine - - def get_engine(self, model_name): - return self.engines[model_name] - - -def run_engine(engine, feed_dict): - return engine.infer(feed_dict) - - -# ----------------------------------------------------------------------------------------------------- -# Utilities for both CUDA and TensorRT EP -# ----------------------------------------------------------------------------------------------------- - - -class StableDiffusionPipelineMixin: - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def encode_prompt(self, clip_engine, prompt, negative_prompt): - """ - Encodes the prompt into text encoder hidden states. - """ - - # Tokenize prompt - text_input_ids = ( - self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - .input_ids.type(torch.int32) - .to(self.torch_device) - ) - - # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt - text_embeddings = run_engine(clip_engine, {"input_ids": text_input_ids})["text_embeddings"].clone() - - # Tokenize negative prompt - uncond_input_ids = ( - self.tokenizer( - negative_prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - .input_ids.type(torch.int32) - .to(self.torch_device) - ) - - uncond_embeddings = run_engine(clip_engine, {"input_ids": uncond_input_ids})["text_embeddings"] - - # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance - text_embeddings = 
torch.cat([uncond_embeddings, text_embeddings]).to(dtype=torch.float16) - - return text_embeddings - - def denoise_latent( - self, - unet_engine, - latents, - text_embeddings, - timesteps=None, - mask=None, - masked_image_latents=None, - timestep_fp16=False, - ): - if not isinstance(timesteps, torch.Tensor): - timesteps = self.scheduler.timesteps - - for _step_index, timestep in enumerate(timesteps): - # Expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep) - if isinstance(mask, torch.Tensor): - latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) - - # Predict the noise residual - timestep_float = timestep.to(torch.float16) if timestep_fp16 else timestep.to(torch.float32) - - noise_pred = run_engine( - unet_engine, - {"sample": latent_model_input, "timestep": timestep_float, "encoder_hidden_states": text_embeddings}, - )["latent"] - - # Perform guidance - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample - - latents = 1.0 / 0.18215 * latents - return latents - - def decode_latent(self, vae_engine, latents): - images = run_engine(vae_engine, {"latent": latents})["images"] - images = (images / 2 + 0.5).clamp(0, 1) - return images.cpu().permute(0, 2, 3, 1).float().numpy() - - def set_cached_folder(self, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): - from diffusers.utils import DIFFUSERS_CACHE - from huggingface_hub import snapshot_download - - cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", False) - use_auth_token = kwargs.pop("use_auth_token", 
None) - revision = kwargs.pop("revision", None) - - self.cached_folder = ( - pretrained_model_name_or_path - if os.path.isdir(pretrained_model_name_or_path) - else snapshot_download( - pretrained_model_name_or_path, - cache_dir=cache_dir, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - ) - ) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_img2img_xl.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_img2img_xl.py deleted file mode 100644 index 31ede1ba901f2..0000000000000 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_img2img_xl.py +++ /dev/null @@ -1,236 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -# Modified from TensorRT demo diffusion, which has the following license: -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -------------------------------------------------------------------------- - -import time - -import torch -from diffusion_models import PipelineInfo -from pipeline_stable_diffusion import StableDiffusionPipeline - - -class Img2ImgXLPipeline(StableDiffusionPipeline): - """ - Stable Diffusion Img2Img XL pipeline. - """ - - def __init__(self, pipeline_info: PipelineInfo, *args, **kwargs): - """ - Initializes the Img2Img XL Diffusion pipeline. - - Args: - pipeline_info (PipelineInfo): - Version and Type of stable diffusion pipeline. - """ - assert pipeline_info.is_xl_refiner() - - super().__init__(pipeline_info, *args, **kwargs) - - self.requires_aesthetics_score = True - - def _get_add_time_ids( - self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype - ): - if self.requires_aesthetics_score: - add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) - add_neg_time_ids = list(original_size + crops_coords_top_left + (negative_aesthetic_score,)) - else: - add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_neg_time_ids = list(original_size + crops_coords_top_left + target_size) - add_time_ids = torch.tensor([add_time_ids], dtype=dtype) - add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) - add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0).to(device=self.device) - return add_time_ids - - def _infer( - self, - prompt, - negative_prompt, - init_image, - image_height, - image_width, - denoising_steps=30, - strength=0.3, - guidance=5.0, - seed=None, - warmup=False, - return_type="image", - ): - assert negative_prompt is None or len(prompt) == len(negative_prompt) - - original_size = (image_height, image_width) - crops_coords_top_left = (0, 0) - target_size = (image_height, image_width) - - aesthetic_score = 6.0 - negative_aesthetic_score = 2.5 - - self.set_denoising_steps(denoising_steps) - self.set_random_seed(seed) - - with 
torch.inference_mode(), torch.autocast("cuda"): - batch_size = len(prompt) - - torch.cuda.synchronize() - e2e_tic = time.perf_counter() - - # Initialize timesteps - timesteps, t_start = self.initialize_timesteps(self.denoising_steps, strength) - - latent_timestep = timesteps[:1].repeat(batch_size) - - # CLIP text encoder 2 - text_embeddings, pooled_embeddings2 = self.encode_prompt( - prompt, - negative_prompt, - encoder="clip2", - tokenizer=self.tokenizer2, - pooled_outputs=True, - output_hidden_states=True, - ) - - # Time embeddings - add_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - aesthetic_score, - negative_aesthetic_score, - dtype=text_embeddings.dtype, - ) - - add_time_ids = add_time_ids.repeat(batch_size, 1) - - add_kwargs = {"text_embeds": pooled_embeddings2, "time_ids": add_time_ids} - - # Pre-process input image - init_image = self.preprocess_images(batch_size, (init_image,))[0] - - # VAE encode init image - if init_image.shape[1] == 4: - init_latents = init_image - else: - init_latents = self.encode_image(init_image) - - # Add noise to latents using timesteps - noise = torch.randn(init_latents.shape, device=self.device, dtype=torch.float32, generator=self.generator) - latents = self.scheduler.add_noise(init_latents, noise, t_start, latent_timestep) - - # UNet denoiser - latents = self.denoise_latent( - latents, - text_embeddings, - timesteps=timesteps, - step_offset=t_start, - denoiser="unetxl", - guidance=guidance, - add_kwargs=add_kwargs, - ) - - with torch.inference_mode(): - # VAE decode latent - if return_type == "latent": - images = latents - else: - images = self.decode_latent(latents / self.vae_scaling_factor) - - torch.cuda.synchronize() - e2e_toc = time.perf_counter() - - perf_data = None - if not warmup: - print("SD-XL Refiner Pipeline") - perf_data = self.print_summary(e2e_tic, e2e_toc, batch_size) - - return images, perf_data - - def run( - self, - prompt, - negative_prompt, - init_image, - 
image_height, - image_width, - denoising_steps=30, - guidance=5.0, - strength=0.3, - seed=None, - warmup=False, - return_type="image", - ): - """ - Run the diffusion pipeline. - - Args: - prompt (str): - The text prompt to guide image generation. - negative_prompt (str): - The prompt not to guide the image generation. - init_image (tuple[torch.Tensor]): - Image from base pipeline. - image_height (int): - Height (in pixels) of the image to be generated. Must be a multiple of 8. - image_width (int): - Width (in pixels) of the image to be generated. Must be a multiple of 8. - denoising_steps (int): - Number of denoising steps. More steps usually lead to higher quality image at the expense of slower inference. - guidance (float): - Higher guidance scale encourages to generate images that are closely linked to the text prompt. - seed (int): - Seed for the random generator - warmup (bool): - Indicate if this is a warmup run. - return_type (str): - It can be "latent" or "image". - """ - - if self.is_backend_tensorrt(): - import tensorrt as trt - from trt_utilities import TRT_LOGGER - - with trt.Runtime(TRT_LOGGER): - return self._infer( - prompt, - negative_prompt, - init_image, - image_height, - image_width, - denoising_steps=denoising_steps, - strength=strength, - guidance=guidance, - seed=seed, - warmup=warmup, - return_type=return_type, - ) - else: - return self._infer( - prompt, - negative_prompt, - init_image, - image_height, - image_width, - denoising_steps=denoising_steps, - strength=strength, - guidance=guidance, - seed=seed, - warmup=warmup, - return_type=return_type, - ) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py index e18a68d3edef8..104ce984bd401 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +++ 
b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py @@ -23,7 +23,8 @@ import os import pathlib import random -from typing import Any, Dict, List +import time +from typing import Any, Dict, List, Optional import numpy as np import nvtx @@ -35,6 +36,8 @@ from engine_builder_ort_cuda import OrtCudaEngineBuilder from engine_builder_ort_trt import OrtTensorrtEngineBuilder from engine_builder_tensorrt import TensorrtEngineBuilder +from engine_builder_torch import TorchEngineBuilder +from PIL import Image class StableDiffusionPipeline: @@ -49,12 +52,11 @@ def __init__( scheduler="DDIM", device="cuda", output_dir=".", - hf_token=None, verbose=False, nvtx_profile=False, use_cuda_graph=False, framework_model_dir="pytorch_model", - engine_type: EngineType = EngineType.ORT_TRT, + engine_type: EngineType = EngineType.ORT_CUDA, ): """ Initializes the Diffusion pipeline. @@ -70,8 +72,6 @@ def __init__( PyTorch device to run inference. Default: 'cuda' output_dir (str): Output directory for log files and image artifacts - hf_token (str): - HuggingFace User Access Token to use for downloading Stable Diffusion model checkpoints. verbose (bool): Enable verbose logging. 
nvtx_profile (bool): @@ -98,7 +98,6 @@ def __init__( print(f"[I] Create directory: {directory}") pathlib.Path(directory).mkdir(parents=True) - self.hf_token = hf_token self.device = device self.torch_device = torch.device(device, torch.cuda.current_device()) self.verbose = verbose @@ -118,24 +117,22 @@ def __init__( # backend engine self.engine_type = engine_type if engine_type == EngineType.TRT: - self.backend = TensorrtEngineBuilder(pipeline_info, max_batch_size, hf_token, device, use_cuda_graph) + self.backend = TensorrtEngineBuilder(pipeline_info, max_batch_size, device, use_cuda_graph) elif engine_type == EngineType.ORT_TRT: - self.backend = OrtTensorrtEngineBuilder(pipeline_info, max_batch_size, hf_token, device, use_cuda_graph) + self.backend = OrtTensorrtEngineBuilder(pipeline_info, max_batch_size, device, use_cuda_graph) elif engine_type == EngineType.ORT_CUDA: - self.backend = OrtCudaEngineBuilder(pipeline_info, max_batch_size, hf_token, device, use_cuda_graph) + self.backend = OrtCudaEngineBuilder(pipeline_info, max_batch_size, device, use_cuda_graph) + elif engine_type == EngineType.TORCH: + self.backend = TorchEngineBuilder(pipeline_info, max_batch_size, device, use_cuda_graph) else: raise RuntimeError(f"Backend engine type {engine_type.name} is not supported") # Load text tokenizer if not self.pipeline_info.is_xl_refiner(): - self.tokenizer = get_tokenizer( - self.pipeline_info, self.framework_model_dir, self.hf_token, subfolder="tokenizer" - ) + self.tokenizer = get_tokenizer(self.pipeline_info, self.framework_model_dir, subfolder="tokenizer") if self.pipeline_info.is_xl(): - self.tokenizer2 = get_tokenizer( - self.pipeline_info, self.framework_model_dir, self.hf_token, subfolder="tokenizer_2" - ) + self.tokenizer2 = get_tokenizer(self.pipeline_info, self.framework_model_dir, subfolder="tokenizer_2") self.control_image_processor = None if self.pipeline_info.is_xl() and self.pipeline_info.controlnet: @@ -147,7 +144,7 @@ def __init__( # Create CUDA 
events self.events = {} - for stage in ["clip", "denoise", "vae", "vae_encoder"]: + for stage in ["clip", "denoise", "vae", "vae_encoder", "pil"]: for marker in ["start", "stop"]: self.events[stage + "-" + marker] = cudart.cudaEventCreate()[1] self.markers = {} @@ -211,7 +208,7 @@ def run_engine(self, model_name, feed_dict): return self.backend.run_engine(model_name, feed_dict) def initialize_latents(self, batch_size, unet_channels, latent_height, latent_width): - latents_dtype = torch.float32 # text_embeddings.dtype + latents_dtype = torch.float16 latents_shape = (batch_size, unet_channels, latent_height, latent_width) latents = torch.randn(latents_shape, device=self.device, dtype=latents_dtype, generator=self.generator) # Scale the initial noise by the standard deviation required by the scheduler @@ -219,6 +216,7 @@ def initialize_latents(self, batch_size, unet_channels, latent_height, latent_wi return latents def initialize_timesteps(self, timesteps, strength): + """Initialize timesteps for refiner.""" self.scheduler.set_timesteps(timesteps) offset = self.scheduler.steps_offset if hasattr(self.scheduler, "steps_offset") else 0 init_timestep = int(timesteps * strength) + offset @@ -227,6 +225,51 @@ def initialize_timesteps(self, timesteps, strength): timesteps = self.scheduler.timesteps[t_start:].to(self.device) return timesteps, t_start + def initialize_refiner(self, batch_size, image, strength): + """Add noise to a reference image.""" + # Initialize timesteps + timesteps, t_start = self.initialize_timesteps(self.denoising_steps, strength) + + latent_timestep = timesteps[:1].repeat(batch_size) + + # Pre-process input image + image = self.preprocess_images(batch_size, (image,))[0] + + # VAE encode init image + if image.shape[1] == 4: + init_latents = image + else: + init_latents = self.encode_image(image) + + # Add noise to latents using timesteps + noise = torch.randn(init_latents.shape, device=self.device, dtype=torch.float16, generator=self.generator) + + 
latents = self.scheduler.add_noise(init_latents, noise, t_start, latent_timestep) + + return timesteps, t_start, latents + + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + dtype, + requires_aesthetics_score, + ): + if requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list(original_size + crops_coords_top_left + (negative_aesthetic_score,)) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(original_size + crops_coords_top_left + target_size) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids + def start_profile(self, name, color="blue"): if self.nvtx_profile: self.markers[name] = nvtx.start_range(message=name, color=color) @@ -245,7 +288,7 @@ def preprocess_images(self, batch_size, images=()): self.start_profile("preprocess", color="pink") init_images = [] for i in images: - image = i.to(self.device).float() + image = i.to(self.device) if image.shape[0] != batch_size: image = image.repeat(batch_size, 1, 1, 1) init_images.append(image) @@ -296,30 +339,46 @@ def encode_prompt( output_hidden_states=False, force_zeros_for_empty_prompt=False, do_classifier_free_guidance=True, + dtype=torch.float16, ): if tokenizer is None: tokenizer = self.tokenizer self.start_profile("clip", color="green") - # Tokenize prompt - text_input_ids = ( - tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", + def tokenize(prompt, output_hidden_states): + text_input_ids = ( + tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + .input_ids.type(torch.int32) + .to(self.device) ) - 
.input_ids.type(torch.int32) - .to(self.device) - ) + + hidden_states = None + if self.engine_type == EngineType.TORCH: + outputs = self.backend.engines[encoder](text_input_ids) + text_embeddings = outputs[0] + if output_hidden_states: + hidden_states = outputs["last_hidden_state"] + else: + outputs = self.run_engine(encoder, {"input_ids": text_input_ids}) + text_embeddings = outputs["text_embeddings"] + if output_hidden_states: + hidden_states = outputs["hidden_states"] + return text_embeddings, hidden_states + + # Tokenize prompt + text_embeddings, hidden_states = tokenize(prompt, output_hidden_states) # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt - outputs = self.run_engine(encoder, {"input_ids": text_input_ids}) - text_embeddings = outputs["text_embeddings"].clone() - if output_hidden_states: - hidden_states = outputs["hidden_states"].clone() + text_embeddings = text_embeddings.clone() + if hidden_states is not None: + hidden_states = hidden_states.clone() # Note: negative prompt embedding is not needed for SD XL when guidance <= 1 if do_classifier_free_guidance: @@ -331,40 +390,26 @@ def encode_prompt( uncond_hidden_states = torch.zeros_like(hidden_states) else: # Tokenize negative prompt - uncond_input_ids = ( - tokenizer( - negative_prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - .input_ids.type(torch.int32) - .to(self.device) - ) - - outputs = self.run_engine(encoder, {"input_ids": uncond_input_ids}) - uncond_embeddings = outputs["text_embeddings"] - if output_hidden_states: - uncond_hidden_states = outputs["hidden_states"] + uncond_embeddings, uncond_hidden_states = tokenize(negative_prompt, output_hidden_states) # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) - if 
pooled_outputs: - pooled_output = text_embeddings - - if output_hidden_states: - if do_classifier_free_guidance: - text_embeddings = torch.cat([uncond_hidden_states, hidden_states]) - else: - text_embeddings = hidden_states + if output_hidden_states: + hidden_states = torch.cat([uncond_hidden_states, hidden_states]) self.stop_profile("clip") if pooled_outputs: - return text_embeddings.to(dtype=torch.float16), pooled_output.to(dtype=torch.float16) - return text_embeddings.to(dtype=torch.float16) + # For text encoder in sdxl base + return hidden_states.to(dtype=dtype), text_embeddings.to(dtype=dtype) + + if output_hidden_states: + # For text encoder 2 in sdxl base or refiner + return hidden_states.to(dtype=dtype) + + # For text encoder in sd 1.5 + return text_embeddings.to(dtype=dtype) def denoise_latent( self, @@ -373,8 +418,6 @@ def denoise_latent( denoiser="unet", timesteps=None, step_offset=0, - mask=None, - masked_image_latents=None, guidance=7.5, add_kwargs=None, ): @@ -393,18 +436,13 @@ def denoise_latent( latent_model_input, step_offset + step_index, timestep ) - if isinstance(mask, torch.Tensor): - latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) - # Predict the noise residual if self.nvtx_profile: nvtx_unet = nvtx.start_range(message="unet", color="blue") - timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep - params = { "sample": latent_model_input, - "timestep": timestep_float, + "timestep": timestep.to(latents.dtype), "encoder_hidden_states": text_embeddings, } @@ -434,9 +472,9 @@ def denoise_latent( self.stop_profile("denoise") return latents - def encode_image(self, init_image): + def encode_image(self, image): self.start_profile("vae_encoder", color="red") - init_latents = self.run_engine("vae_encoder", {"images": init_image})["latent"] + init_latents = self.run_engine("vae_encoder", {"images": image})["latent"] init_latents = self.vae_scaling_factor * init_latents 
self.stop_profile("vae_encoder") return init_latents @@ -447,7 +485,7 @@ def decode_latent(self, latents): self.stop_profile("vae") return images - def print_summary(self, tic, toc, batch_size, vae_enc=False) -> Dict[str, Any]: + def print_summary(self, tic, toc, batch_size, vae_enc=False, pil=False) -> Dict[str, Any]: throughput = batch_size / (toc - tic) latency_clip = cudart.cudaEventElapsedTime(self.events["clip-start"], self.events["clip-stop"])[1] latency_unet = cudart.cudaEventElapsedTime(self.events["denoise-start"], self.events["denoise-stop"])[1] @@ -457,6 +495,8 @@ def print_summary(self, tic, toc, batch_size, vae_enc=False) -> Dict[str, Any]: if vae_enc else None ) + latency_pil = cudart.cudaEventElapsedTime(self.events["pil-start"], self.events["pil-stop"])[1] if pil else None + latency = (toc - tic) * 1000.0 print("|----------------|--------------|") @@ -472,9 +512,11 @@ def print_summary(self, tic, toc, batch_size, vae_enc=False) -> Dict[str, Any]: ) ) print("| {:^14} | {:>9.2f} ms |".format("VAE-Dec", latency_vae)) - + pipeline = "Refiner" if self.pipeline_info.is_xl_refiner() else "Pipeline" + if pil: + print("| {:^14} | {:>9.2f} ms |".format("PIL", latency_pil)) print("|----------------|--------------|") - print("| {:^14} | {:>9.2f} ms |".format("Pipeline", latency)) + print(f"| {pipeline:^14} | {latency:>9.2f} ms |") print("|----------------|--------------|") print(f"Throughput: {throughput:.2f} image/s") @@ -482,6 +524,7 @@ def print_summary(self, tic, toc, batch_size, vae_enc=False) -> Dict[str, Any]: "latency_clip": latency_clip, "latency_unet": latency_unet, "latency_vae": latency_vae, + "latency_pil": latency_pil, "latency": latency, "throughput": throughput, } @@ -490,15 +533,19 @@ def print_summary(self, tic, toc, batch_size, vae_enc=False) -> Dict[str, Any]: return perf_data @staticmethod - def to_pil_image(images): + def pt_to_pil(images): images = ( ((images + 1) * 255 / 2).clamp(0, 255).detach().permute(0, 2, 3, 
1).round().type(torch.uint8).cpu().numpy() ) - - from PIL import Image - return [Image.fromarray(images[i]) for i in range(images.shape[0])] + @staticmethod + def pt_to_numpy(images: torch.FloatTensor): + """ + Convert a PyTorch tensor to a NumPy image. + """ + return ((images + 1) / 2).clamp(0, 1).detach().permute(0, 2, 3, 1).float().cpu().numpy() + def metadata(self) -> Dict[str, Any]: return { "actual_steps": self.actual_steps, @@ -509,7 +556,6 @@ def metadata(self) -> Dict[str, Any]: } def save_images(self, images: List, prompt: List[str], negative_prompt: List[str], metadata: Dict[str, Any]): - images = self.to_pil_image(images) session_id = str(random.randint(1000, 9999)) for i, image in enumerate(images): seed = str(self.get_current_seed()) @@ -527,3 +573,249 @@ def save_images(self, images: List, prompt: List[str], negative_prompt: List[str info.add_text("negative_prompt", negative_prompt[i]) image.save(image_path, "PNG", pnginfo=info) + + def _infer( + self, + prompt, + negative_prompt, + image_height, + image_width, + denoising_steps=30, + guidance=5.0, + seed=None, + image=None, + strength=0.3, + controlnet_images=None, + controlnet_scales=None, + show_latency=False, + output_type="pil", + ): + if show_latency: + torch.cuda.synchronize() + start_time = time.perf_counter() + + assert len(prompt) == len(negative_prompt) + batch_size = len(prompt) + + self.set_denoising_steps(denoising_steps) + self.set_random_seed(seed) + + timesteps = None + step_offset = 0 + with torch.inference_mode(), torch.autocast("cuda"): + if image is not None: + timesteps, step_offset, latents = self.initialize_refiner( + batch_size=batch_size, + image=image, + strength=strength, + ) + else: + # Pre-initialize latents + latents = self.initialize_latents( + batch_size=batch_size, + unet_channels=4, + latent_height=(image_height // 8), + latent_width=(image_width // 8), + ) + + do_classifier_free_guidance = guidance > 1.0 + if not self.pipeline_info.is_xl(): + denoiser = "unet" + 
text_embeddings = self.encode_prompt( + prompt, + negative_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + dtype=latents.dtype, + ) + add_kwargs = {} + else: + denoiser = "unetxl" + + # Time embeddings + original_size = (image_height, image_width) + crops_coords_top_left = (0, 0) + target_size = (image_height, image_width) + aesthetic_score = 6.0 + negative_aesthetic_score = 2.5 + add_time_ids, add_negative_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + dtype=latents.dtype, + requires_aesthetics_score=self.pipeline_info.is_xl_refiner(), + ) + if do_classifier_free_guidance: + add_time_ids = torch.cat([add_negative_time_ids, add_time_ids], dim=0) + add_time_ids = add_time_ids.to(device=self.device).repeat(batch_size, 1) + + if self.pipeline_info.is_xl_refiner(): + # CLIP text encoder 2 + text_embeddings, pooled_embeddings2 = self.encode_prompt( + prompt, + negative_prompt, + encoder="clip2", + tokenizer=self.tokenizer2, + pooled_outputs=True, + output_hidden_states=True, + dtype=latents.dtype, + ) + add_kwargs = {"text_embeds": pooled_embeddings2, "time_ids": add_time_ids} + else: # XL Base + # CLIP text encoder + text_embeddings = self.encode_prompt( + prompt, + negative_prompt, + encoder="clip", + tokenizer=self.tokenizer, + output_hidden_states=True, + force_zeros_for_empty_prompt=True, + do_classifier_free_guidance=do_classifier_free_guidance, + dtype=latents.dtype, + ) + # CLIP text encoder 2 + text_embeddings2, pooled_embeddings2 = self.encode_prompt( + prompt, + negative_prompt, + encoder="clip2", + tokenizer=self.tokenizer2, + pooled_outputs=True, + output_hidden_states=True, + force_zeros_for_empty_prompt=True, + do_classifier_free_guidance=do_classifier_free_guidance, + dtype=latents.dtype, + ) + + # Merged text embeddings + text_embeddings = torch.cat([text_embeddings, text_embeddings2], dim=-1) + + add_kwargs = {"text_embeds": 
pooled_embeddings2, "time_ids": add_time_ids} + + if self.pipeline_info.controlnet: + controlnet_images = self.preprocess_controlnet_images( + latents.shape[0], + controlnet_images, + do_classifier_free_guidance=do_classifier_free_guidance, + height=image_height, + width=image_width, + ) + add_kwargs.update( + { + "controlnet_images": controlnet_images, + "controlnet_scales": controlnet_scales.to(controlnet_images.dtype).to(controlnet_images.device), + } + ) + + # UNet denoiser + latents = self.denoise_latent( + latents, + text_embeddings, + timesteps=timesteps, + step_offset=step_offset, + denoiser=denoiser, + guidance=guidance, + add_kwargs=add_kwargs, + ) + + with torch.inference_mode(): + # VAE decode latent + if output_type == "latent": + images = latents + else: + images = self.decode_latent(latents / self.vae_scaling_factor) + if output_type == "pil": + self.start_profile("pil", color="green") + images = self.pt_to_pil(images) + self.stop_profile("pil") + + perf_data = None + if show_latency: + torch.cuda.synchronize() + end_time = time.perf_counter() + perf_data = self.print_summary( + start_time, end_time, batch_size, vae_enc=self.pipeline_info.is_xl_refiner(), pil=(output_type == "pil") + ) + + return images, perf_data + + def run( + self, + prompt: List[str], + negative_prompt: List[str], + image_height: int, + image_width: int, + denoising_steps: int = 30, + guidance: float = 5.0, + seed: Optional[int] = None, + image: Optional[torch.Tensor] = None, + strength: float = 0.3, + controlnet_images: Optional[torch.Tensor] = None, + controlnet_scales: Optional[torch.Tensor] = None, + show_latency: bool = False, + output_type: str = "pil", + ): + """ + Run the diffusion pipeline. + + Args: + prompt (List[str]): + The text prompt to guide image generation. + negative_prompt (List[str]): + The prompt not to guide the image generation. + image_height (int): + Height (in pixels) of the image to be generated. Must be a multiple of 8. 
+ image_width (int): + Width (in pixels) of the image to be generated. Must be a multiple of 8. + denoising_steps (int): + Number of denoising steps. More steps usually lead to higher quality image at the expense of slower inference. + guidance (float): + Higher guidance scale encourages to generate images that are closely linked to the text prompt. + seed (int): + Seed for the random generator + image (tuple[torch.Tensor]): + Reference image. + strength (float): + Indicates extent to transform the reference image, which is used as a starting point, + and more noise is added the higher the strength. + show_latency (bool): + Whether return latency data. + output_type (str): + It can be "latent", "pt" or "pil". + """ + if self.is_backend_tensorrt(): + import tensorrt as trt + from trt_utilities import TRT_LOGGER + + with trt.Runtime(TRT_LOGGER): + return self._infer( + prompt, + negative_prompt, + image_height, + image_width, + denoising_steps=denoising_steps, + guidance=guidance, + seed=seed, + image=image, + strength=strength, + controlnet_images=controlnet_images, + controlnet_scales=controlnet_scales, + show_latency=show_latency, + output_type=output_type, + ) + else: + return self._infer( + prompt, + negative_prompt, + image_height, + image_width, + denoising_steps=denoising_steps, + guidance=guidance, + seed=seed, + image=image, + strength=strength, + controlnet_images=controlnet_images, + controlnet_scales=controlnet_scales, + show_latency=show_latency, + output_type=output_type, + ) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_txt2img.py deleted file mode 100644 index 2d2fdb542c845..0000000000000 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_txt2img.py +++ /dev/null @@ -1,178 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. 
class Txt2ImgPipeline(StableDiffusionPipeline):
    """
    Stable Diffusion Txt2Img pipeline.

    `run` is the public entry point. When the backend is TensorRT it wraps inference in a
    `trt.Runtime` context; otherwise it calls `_infer` directly.
    """

    def __init__(self, pipeline_info: PipelineInfo, **kwargs):
        """
        Initializes the Txt2Img Diffusion pipeline.

        Args:
            pipeline_info (PipelineInfo):
                Version and Type of stable diffusion pipeline.
            **kwargs:
                Passed through unchanged to the base class constructor.
        """
        super().__init__(pipeline_info, **kwargs)

    def _infer(
        self,
        prompt,
        negative_prompt,
        image_height,
        image_width,
        denoising_steps=50,
        guidance=7.5,
        seed=None,
        controlnet_images=None,
        controlnet_scales=None,
        warmup=False,
        return_type="latent",
    ):
        """
        Run one text-to-image pass: encode the prompts, denoise latents, optionally decode.

        Args:
            prompt:
                Text prompts. `len(prompt)` is used as the batch size.
            negative_prompt:
                Negative prompts. Must have the same length as `prompt`.
            image_height (int):
                Height (in pixels) of the image. The latent height is `image_height // 8`.
            image_width (int):
                Width (in pixels) of the image. The latent width is `image_width // 8`.
            denoising_steps (int):
                Number of denoising steps.
            guidance (float):
                Guidance scale. Classifier-free guidance is enabled when it is greater than 1.0.
            seed (int):
                Seed for the random generator.
            controlnet_images:
                Control images. Only used when `self.pipeline_info.controlnet` is truthy.
            controlnet_scales:
                Control net scales. Only used when `self.pipeline_info.controlnet` is truthy;
                `.to(...)` is called on it, so presumably a torch tensor - confirm with callers.
            warmup (bool):
                When True, no performance summary is produced.
            return_type (str):
                "latent" returns the denoised latents without VAE decoding; any other value
                returns the decoded images.

        Returns:
            tuple: `(images, perf_data)`. `perf_data` is the value returned by
            `self.print_summary`, or None for a warmup run.
        """
        assert len(prompt) == len(negative_prompt)
        batch_size = len(prompt)

        self.set_denoising_steps(denoising_steps)
        self.set_random_seed(seed)

        with torch.inference_mode(), torch.autocast("cuda"):
            # Pre-initialize latents. This happens before the timer starts, so it is not
            # part of the measured end-to-end interval.
            latents = self.initialize_latents(
                batch_size=batch_size,
                unet_channels=4,
                latent_height=(image_height // 8),
                latent_width=(image_width // 8),
            )

            torch.cuda.synchronize()
            e2e_tic = time.perf_counter()

            # CLIP text encoder
            do_classifier_free_guidance = guidance > 1.0
            text_embeddings = self.encode_prompt(
                prompt,
                negative_prompt,
                do_classifier_free_guidance=do_classifier_free_guidance,
            )

            add_kwargs = None
            if self.pipeline_info.controlnet:
                controlnet_images = self.preprocess_controlnet_images(
                    latents.shape[0], controlnet_images, do_classifier_free_guidance=do_classifier_free_guidance
                )
                add_kwargs = {
                    "controlnet_images": controlnet_images,
                    "controlnet_scales": controlnet_scales.to(controlnet_images.dtype).to(controlnet_images.device),
                }

            # UNet denoiser
            latents = self.denoise_latent(latents, text_embeddings, guidance=guidance, add_kwargs=add_kwargs)

            # VAE decode latent, unless the caller asked for the raw latents.
            # Previously return_type was accepted but ignored and the latents were always decoded.
            if return_type == "latent":
                images = latents
            else:
                images = self.decode_latent(latents / self.vae_scaling_factor)

            torch.cuda.synchronize()
            e2e_toc = time.perf_counter()

        perf_data = None
        if not warmup:
            perf_data = self.print_summary(e2e_tic, e2e_toc, batch_size)

        return images, perf_data

    def run(
        self,
        prompt,
        negative_prompt,
        image_height,
        image_width,
        denoising_steps=30,
        guidance=7.5,
        seed=None,
        controlnet_images=None,
        controlnet_scales=None,
        warmup=False,
        return_type="image",
    ):
        """
        Run the diffusion pipeline.

        Args:
            prompt (str):
                The text prompt to guide image generation.
            negative_prompt (str):
                The prompt not to guide the image generation.
            image_height (int):
                Height (in pixels) of the image to be generated. Must be a multiple of 8.
            image_width (int):
                Width (in pixels) of the image to be generated. Must be a multiple of 8.
            denoising_steps (int):
                Number of denoising steps. More steps usually lead to higher quality image at the expense of slower inference.
            guidance (float):
                Higher guidance scale encourages to generate images that are closely linked to the text prompt.
            seed (int):
                Seed for the random generator
            controlnet_images:
                Control images, forwarded to `_infer`.
            controlnet_scales:
                Control net scales, forwarded to `_infer`.
            warmup (bool):
                Indicate if this is a warmup run.
            return_type (str):
                type of return. The value can be "latent" or "image".

        Returns:
            tuple: `(images, perf_data)` exactly as returned by `_infer`.
        """
        # Both backends forward the same keyword arguments; build them once.
        infer_kwargs = {
            "denoising_steps": denoising_steps,
            "guidance": guidance,
            "seed": seed,
            "controlnet_images": controlnet_images,
            "controlnet_scales": controlnet_scales,
            "warmup": warmup,
            "return_type": return_type,
        }

        if self.is_backend_tensorrt():
            # Imported lazily so that the tensorrt package is only needed for the TensorRT backend.
            import tensorrt as trt
            from trt_utilities import TRT_LOGGER

            with trt.Runtime(TRT_LOGGER):
                return self._infer(prompt, negative_prompt, image_height, image_width, **infer_kwargs)

        return self._infer(prompt, negative_prompt, image_height, image_width, **infer_kwargs)
class Txt2ImgXLPipeline(StableDiffusionPipeline):
    """
    Text-to-image pipeline for Stable Diffusion XL.
    """

    def __init__(self, pipeline_info: PipelineInfo, *args, **kwargs):
        """
        Set up the XL text-to-image pipeline.

        Args:
            pipeline_info (PipelineInfo):
                Version and type of the stable diffusion pipeline. Asserted to satisfy
                `is_xl_base_or_turbo()`.
        """
        assert pipeline_info.is_xl_base_or_turbo()

        super().__init__(pipeline_info, *args, **kwargs)

    def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
        """Concatenate the three size arguments into a one-row tensor of the given dtype."""
        return torch.tensor([list(original_size + crops_coords_top_left + target_size)], dtype=dtype)

    def _infer(
        self,
        prompt,
        negative_prompt,
        image_height,
        image_width,
        denoising_steps=30,
        guidance=5.0,
        seed=None,
        controlnet_images=None,
        controlnet_scales=None,
        warmup=False,
        return_type="image",
    ):
        """
        Encode the prompts with both text encoders, denoise with the XL UNet, optionally decode.

        Returns:
            tuple: `(images, perf_data)`. `images` is the latents when `return_type` is
            "latent", otherwise the decoded images. `perf_data` is None for a warmup run.
        """
        assert len(prompt) == len(negative_prompt)
        use_cfg = guidance > 1.0
        full_size = (image_height, image_width)
        crop_origin = (0, 0)
        num_prompts = len(prompt)

        self.set_denoising_steps(denoising_steps)
        self.set_random_seed(seed)

        with torch.inference_mode(), torch.autocast("cuda"):
            # Latents are created before the timer starts.
            latents = self.initialize_latents(
                batch_size=num_prompts,
                unet_channels=4,
                latent_height=(image_height // 8),
                latent_width=(image_width // 8),
            )

            torch.cuda.synchronize()
            started = time.perf_counter()

            # Options shared by both text encoder calls.
            common_encode_args = {
                "output_hidden_states": True,
                "force_zeros_for_empty_prompt": True,
                "do_classifier_free_guidance": use_cfg,
            }

            # First CLIP text encoder.
            clip_embeddings = self.encode_prompt(
                prompt,
                negative_prompt,
                encoder="clip",
                tokenizer=self.tokenizer,
                **common_encode_args,
            )
            # Second CLIP text encoder, which also yields pooled embeddings.
            clip2_embeddings, pooled_embeddings2 = self.encode_prompt(
                prompt,
                negative_prompt,
                encoder="clip2",
                tokenizer=self.tokenizer2,
                pooled_outputs=True,
                **common_encode_args,
            )

            # Join the two encoders' embeddings along the last dimension.
            text_embeddings = torch.cat([clip_embeddings, clip2_embeddings], dim=-1)

            # Time ids: original size, crop origin and target size, one row per prompt.
            time_ids = self._get_add_time_ids(full_size, crop_origin, full_size, dtype=text_embeddings.dtype)
            time_ids = time_ids.repeat(num_prompts, 1)
            if use_cfg:
                time_ids = torch.cat([time_ids, time_ids], dim=0)

            add_kwargs = {"text_embeds": pooled_embeddings2, "time_ids": time_ids.to(self.device)}
            if self.pipeline_info.controlnet:
                controlnet_images = self.preprocess_controlnet_images(
                    latents.shape[0],
                    controlnet_images,
                    do_classifier_free_guidance=use_cfg,
                    height=image_height,
                    width=image_width,
                )
                add_kwargs["controlnet_images"] = controlnet_images
                add_kwargs["controlnet_scales"] = controlnet_scales.to(controlnet_images.dtype).to(
                    controlnet_images.device
                )

            # XL UNet denoiser.
            latents = self.denoise_latent(
                latents,
                text_embeddings,
                denoiser="unetxl",
                guidance=guidance,
                add_kwargs=add_kwargs,
            )

            # Decode with the VAE unless raw latents were requested.
            images = latents if return_type == "latent" else self.decode_latent(latents / self.vae_scaling_factor)

            torch.cuda.synchronize()
            finished = time.perf_counter()

        if warmup:
            return images, None

        print("SD-XL Base Pipeline")
        return images, self.print_summary(started, finished, num_prompts)

    def run(
        self,
        prompt,
        negative_prompt,
        image_height,
        image_width,
        denoising_steps=30,
        guidance=5.0,
        seed=None,
        controlnet_images=None,
        controlnet_scales=None,
        warmup=False,
        return_type="image",
    ):
        """
        Run the diffusion pipeline.

        Args:
            prompt (str):
                The text prompt to guide image generation.
            negative_prompt (str):
                The prompt not to guide the image generation.
            image_height (int):
                Height (in pixels) of the image to be generated. Must be a multiple of 8.
            image_width (int):
                Width (in pixels) of the image to be generated. Must be a multiple of 8.
            denoising_steps (int):
                Number of denoising steps. More steps usually lead to higher quality image at the expense of slower inference.
            guidance (float):
                Higher guidance scale encourages to generate images that are closely linked to the text prompt.
            seed (int):
                Seed for the random generator
            warmup (bool):
                Indicate if this is a warmup run.
            return_type (str):
                It can be "latent" or "image".
        """
        positional = (prompt, negative_prompt, image_height, image_width)
        options = {
            "denoising_steps": denoising_steps,
            "guidance": guidance,
            "seed": seed,
            "controlnet_images": controlnet_images,
            "controlnet_scales": controlnet_scales,
            "warmup": warmup,
            "return_type": return_type,
        }

        if not self.is_backend_tensorrt():
            return self._infer(*positional, **options)

        import tensorrt as trt
        from trt_utilities import TRT_LOGGER

        with trt.Runtime(TRT_LOGGER):
            return self._infer(*positional, **options)
def change_graph_input_type(
    self,
    graph_input: "ValueInfoProto",
    new_type: int,
):
    """Change the element type of a graph input, adding or removing Cast nodes as needed.

    Consumers of the input that are not Cast nodes are rewired to one newly inserted Cast
    node that converts the input back to its original element type. Consumer Cast nodes
    that already cast to ``new_type`` become redundant: their output is replaced by the
    graph input everywhere, and they are removed unless their output is a graph output.

    Args:
        graph_input (ValueInfoProto): input of the graph
        new_type (int): new data type like TensorProto.INT32.

    Returns:
        NodeProto: the Cast node that was added. None if no Cast node was added.
        List[NodeProto]: Cast nodes that have been removed.
    """
    assert isinstance(graph_input, ValueInfoProto)
    assert self.find_graph_input(graph_input.name)

    # Nothing to do when the input already has the requested type.
    if graph_input.type.tensor_type.elem_type == int(new_type):
        return None, []

    graph = self.graph()
    new_cast_node = None
    nodes_to_remove = []

    input_name_to_nodes = self.input_name_to_nodes()
    if graph_input.name in input_name_to_nodes:
        nodes = input_name_to_nodes[graph_input.name]

        # For children that are not Cast nodes, insert one Cast node that converts the new
        # type back to the original data type, so those children see an unchanged type.
        nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
        if nodes_not_cast:
            node_name = self.create_node_name("Cast")
            output_name = node_name + "_" + graph_input.name
            # The Cast output keeps the original type: graph_input is copied before its
            # elem_type is overwritten at the end of this method.
            new_value_info = graph.value_info.add()
            new_value_info.CopyFrom(graph_input)
            new_value_info.name = output_name
            new_cast_node = helper.make_node(
                "Cast",
                [graph_input.name],
                [output_name],
                to=int(graph_input.type.tensor_type.elem_type),
                name=node_name,
            )
            graph.node.extend([new_cast_node])

            for node in nodes_not_cast:
                OnnxModel.replace_node_input(node, graph_input.name, output_name)

        # For children that are Cast nodes, no need to insert a Cast.
        # When a child casts to the new type, that Cast can be bypassed since the input
        # already has the new type. It is kept only if its output is a graph output.
        nodes_cast = [node for node in nodes if node.op_type == "Cast"]
        for node in nodes_cast:
            if OnnxModel.get_node_attribute(node, "to") == int(new_type):
                self.replace_input_of_all_nodes(node.output[0], graph_input.name)
                if not self.find_graph_output(node.output[0]):
                    nodes_to_remove.append(node)
        if nodes_to_remove:
            self.remove_nodes(nodes_to_remove)

    graph_input.type.tensor_type.elem_type = int(new_type)
    return new_cast_node, nodes_to_remove


def change_graph_output_type(
    self,
    graph_output: "ValueInfoProto",
    new_type: int,
):
    """Change the element type of a graph output by appending a Cast node.

    The tensor that used to be the graph output is renamed, for its producer and for any
    internal consumers, and a new Cast node converts it to ``new_type`` under the original
    output name.

    Args:
        graph_output (ValueInfoProto): output of the graph
        new_type (int): new data type.

    Returns:
        NodeProto: the Cast node that was added. None if no Cast node was added.
    """
    assert isinstance(graph_output, ValueInfoProto)
    assert self.find_graph_output(graph_output.name)

    # Nothing to do when the output already has the requested type.
    if graph_output.type.tensor_type.elem_type == int(new_type):
        return None

    graph = self.graph()

    # Add a cast node
    node_name = self.create_node_name("Cast")
    input_name = node_name + "_" + graph_output.name

    # Move the pre-cast tensor to the new name on both sides. Renaming only the consumer
    # inputs (as before) left the Cast reading a tensor that no node produces, and made the
    # Cast and the original producer both emit graph_output.name.
    # This must run before the Cast node is added, so the Cast output is not renamed too.
    self.replace_input_of_all_nodes(graph_output.name, input_name)
    self.replace_output_of_all_nodes(graph_output.name, input_name)

    # Record the pre-cast tensor with the original type: graph_output is copied before its
    # elem_type is overwritten below.
    new_value_info = graph.value_info.add()
    new_value_info.CopyFrom(graph_output)
    new_value_info.name = input_name
    cast_node = helper.make_node(
        "Cast",
        [input_name],
        [graph_output.name],
        to=int(new_type),
        name=node_name,
    )
    graph.node.extend([cast_node])
    graph_output.type.tensor_type.elem_type = int(new_type)
    return cast_node


def rename_graph_output(self, old_name: str, new_name: str):
    """Rename a graph output together with every node input and output that uses the name.

    Does nothing when no graph output is named ``old_name``.

    Args:
        old_name (str): current name of the graph output.
        new_name (str): name to use instead.

    Raises:
        RuntimeError: if ``new_name`` is already a key of ``self.output_name_to_node()``.
    """
    if new_name in self.output_name_to_node():
        # The f prefix was missing, so the message used to contain a literal "{new_name}".
        raise RuntimeError(f"{new_name} exists in graph")

    graph = self.graph()
    for output in graph.output:
        if output.name == old_name:
            logger.debug("replace output name from %s to %s", old_name, new_name)
            self.replace_input_of_all_nodes(old_name, new_name)
            self.replace_output_of_all_nodes(old_name, new_name)
            output.name = new_name
onnx_model import OnnxModel logger = getLogger(__name__) @@ -170,78 +170,13 @@ def get_graph_inputs_from_fused_nodes(self, casted: bool): inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) return inputs - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - def change_graph_inputs_to_int32(self): """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" graph = self.graph() add_cast_count = 0 remove_cast_count = 0 for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type(graph, graph_input, TensorProto.INT32) + new_node, removed_nodes = self.change_graph_input_type(graph_input, TensorProto.INT32) if new_node: add_cast_count += 1 remove_cast_count += len(removed_nodes)