From bf462c650b7f1be7f98eda68db86ca47dd2af552 Mon Sep 17 00:00:00 2001 From: WangYi Date: Wed, 7 Aug 2024 11:59:44 +0800 Subject: [PATCH 1/4] support flux --- benchmarks/text_to_image.py | 15 +++++++++++++++ .../compilers/diffusion_pipeline_compiler.py | 1 + 2 files changed, 16 insertions(+) diff --git a/benchmarks/text_to_image.py b/benchmarks/text_to_image.py index 85ec6bb43..8164c50ed 100644 --- a/benchmarks/text_to_image.py +++ b/benchmarks/text_to_image.py @@ -46,6 +46,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--model", type=str, default=MODEL) + parser.add_argument("--dtype", type=str, default="fp16") parser.add_argument("--variant", type=str, default=VARIANT) parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) parser.add_argument("--scheduler", type=str, default=SCHEDULER) @@ -92,6 +93,8 @@ def parse_args(): default=QUANTIZE_CONFIG, ) parser.add_argument("--quant-submodules-config-path", type=str, default=None) + parser.add_argument("--custom-revision", type=str, default=None) + parser.add_argument("--local-files-only", action="store_true") return parser.parse_args() @@ -108,6 +111,8 @@ def load_pipe( scheduler=None, lora=None, controlnet=None, + custom_revision=None, + local_files_only=False, ): extra_kwargs = {} if custom_pipeline is not None: @@ -115,6 +120,8 @@ def load_pipe( if variant is not None: extra_kwargs["variant"] = variant if dtype is not None: + dtype = getattr(torch, dtype) + assert isinstance(dtype, torch.dtype) extra_kwargs["torch_dtype"] = dtype if controlnet is not None: from diffusers import ControlNetModel @@ -124,6 +131,11 @@ def load_pipe( torch_dtype=dtype, ) extra_kwargs["controlnet"] = controlnet + if custom_revision is not None: + extra_kwargs["custom_revision"] = custom_revision + if local_files_only: + extra_kwargs["local_files_only"] = True + if os.path.exists(os.path.join(model_name, "calibrate_info.txt")): from onediff.quantization import QuantPipeline @@ -231,11 
+243,14 @@ def main(): pipe = load_pipe( pipeline_cls, args.model, + dtype=args.dtype, variant=args.variant, custom_pipeline=args.custom_pipeline, scheduler=args.scheduler, lora=args.lora, controlnet=args.controlnet, + custom_revision=args.custom_revision, + local_files_only=args.local_files_only, ) core_net = None diff --git a/onediff_diffusers_extensions/onediffx/compilers/diffusion_pipeline_compiler.py b/onediff_diffusers_extensions/onediffx/compilers/diffusion_pipeline_compiler.py index e7907f37f..fba45330d 100644 --- a/onediff_diffusers_extensions/onediffx/compilers/diffusion_pipeline_compiler.py +++ b/onediff_diffusers_extensions/onediffx/compilers/diffusion_pipeline_compiler.py @@ -37,6 +37,7 @@ def _recursive_setattr(obj, attr, value): "vqgan.up_blocks", # for StableCascadeDecoderPipeline "vae.decoder", "vae.encoder", + "transformer", ] From c50496c197a20b04da8cb2930c861798867b1ddf Mon Sep 17 00:00:00 2001 From: WangYi Date: Wed, 7 Aug 2024 13:42:22 +0800 Subject: [PATCH 2/4] add example for flux --- .../examples/text_to_image_flux.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 onediff_diffusers_extensions/examples/text_to_image_flux.py diff --git a/onediff_diffusers_extensions/examples/text_to_image_flux.py b/onediff_diffusers_extensions/examples/text_to_image_flux.py new file mode 100644 index 000000000..15ad0427d --- /dev/null +++ b/onediff_diffusers_extensions/examples/text_to_image_flux.py @@ -0,0 +1,87 @@ +import argparse +import time + +import cv2 +import numpy as np +import torch + +from diffusers import FluxPipeline +from PIL import Image + +parser = argparse.ArgumentParser() +parser.add_argument("--base", type=str, default="black-forest-labs/FLUX.1-schnell") +parser.add_argument( + "--prompt", + type=str, + default="chinese painting style women", +) +parser.add_argument("--height", type=int, default=512) +parser.add_argument("--width", type=int, default=512) +parser.add_argument("--n_steps", type=int, default=4) 
+parser.add_argument( + "--saved_image", type=str, required=False, default="flux-out.png" +) +parser.add_argument("--seed", type=int, default=1) +parser.add_argument("--warmup", type=int, default=1) +parser.add_argument("--run", type=int, default=3) +parser.add_argument( + "--compile", type=(lambda x: str(x).lower() in ["true", "1", "yes"]), default=True +) +args = parser.parse_args() + + +# load stable diffusion +import ipdb; ipdb.set_trace() +# pipe = FluxPipeline.from_pretrained(args.base, torch_dtype=torch.bfloat16, local_files_only=True, custom_revision="93424e3") +pipe = FluxPipeline.from_pretrained(args.base, torch_dtype=torch.bfloat16, custom_revision="93424e3a1530639fefdf08d2a7a954312e5cb254") +# pipe = FluxPipeline.from_pretrained(args.base, torch_dtype=torch.bfloat16) +# pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.float16) +pipe.to("cuda") + +# import ipdb; ipdb.set_trace() + +if args.compile: + from onediffx import compile_pipe + + pipe = compile_pipe(pipe, backend="nexfort") + + +# generate image +generator = torch.manual_seed(args.seed) + +print("Warmup") +for i in range(args.warmup): + image = pipe( + args.prompt, + height=args.height, + width=args.width, + output_type="pil", + num_inference_steps=args.n_steps, #use a larger number if you are using [dev] + generator=torch.Generator("cpu").manual_seed(args.seed) + ).images[0] + + +print("Run") +for i in range(args.run): + begin = time.time() + image = pipe( + args.prompt, + height=args.height, + width=args.width, + output_type="pil", + num_inference_steps=args.n_steps, #use a larger number if you are using [dev] + generator=torch.Generator("cpu").manual_seed(args.seed) + ).images[0] + end = time.time() + print(f"Inference time: {end - begin:.3f}s") + + image.save(f"{i=}th_{args.saved_image}.png") + +image = pipe( + args.prompt, + height=args.height // 2, + width=args.width // 2, + output_type="pil", + num_inference_steps=args.n_steps, #use a larger number 
if you are using [dev] + generator=torch.Generator("cpu").manual_seed(args.seed) +).images[0] \ No newline at end of file From c962a7e8e02883824d504180f56188e203dd9c7d Mon Sep 17 00:00:00 2001 From: WangYi Date: Thu, 8 Aug 2024 10:02:22 +0800 Subject: [PATCH 3/4] add readme, add example for flux --- benchmarks/text_to_image.py | 26 +++-- .../examples/flux/README.md | 101 ++++++++++++++++++ .../examples/text_to_image_flux.py | 60 +++++++---- src/onediff/utils/import_utils.py | 5 +- 4 files changed, 160 insertions(+), 32 deletions(-) create mode 100644 onediff_diffusers_extensions/examples/flux/README.md diff --git a/benchmarks/text_to_image.py b/benchmarks/text_to_image.py index 8164c50ed..32013fcf6 100644 --- a/benchmarks/text_to_image.py +++ b/benchmarks/text_to_image.py @@ -46,7 +46,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--model", type=str, default=MODEL) - parser.add_argument("--dtype", type=str, default="fp16") + parser.add_argument("--dtype", type=str, default="half") parser.add_argument("--variant", type=str, default=VARIANT) parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) parser.add_argument("--scheduler", type=str, default=SCHEDULER) @@ -93,7 +93,7 @@ def parse_args(): default=QUANTIZE_CONFIG, ) parser.add_argument("--quant-submodules-config-path", type=str, default=None) - parser.add_argument("--custom-revision", type=str, default=None) + parser.add_argument("--revision", type=str, default=None) parser.add_argument("--local-files-only", action="store_true") return parser.parse_args() @@ -111,7 +111,7 @@ def load_pipe( scheduler=None, lora=None, controlnet=None, - custom_revision=None, + revision=None, local_files_only=False, ): extra_kwargs = {} @@ -131,8 +131,8 @@ def load_pipe( torch_dtype=dtype, ) extra_kwargs["controlnet"] = controlnet - if custom_revision is not None: - extra_kwargs["custom_revision"] = custom_revision + if revision is not None: + extra_kwargs["revision"] = revision 
if local_files_only: extra_kwargs["local_files_only"] = True @@ -249,7 +249,7 @@ def main(): scheduler=args.scheduler, lora=args.lora, controlnet=args.controlnet, - custom_revision=args.custom_revision, + revision=args.revision, local_files_only=args.local_files_only, ) @@ -364,6 +364,13 @@ def get_kwarg_inputs(): kwarg_inputs["cache_block_id"] = args.cache_block_id return kwarg_inputs + kwarg_inputs = get_kwarg_inputs() + + # patch for flux pipeline, rename negative_prompt to prompt2 + if pipe.__class__.__name__ == "FluxPipeline": + kwarg_inputs["prompt_2"] = kwarg_inputs["negative_prompt"] + kwarg_inputs.pop("negative_prompt") + # NOTE: Warm it up. # The initial calls will trigger compilation and might be very slow. # After that, it should be very fast. @@ -372,7 +379,7 @@ def get_kwarg_inputs(): print("=======================================") print("Begin warmup") for _ in range(args.warmups): - pipe(**get_kwarg_inputs()) + pipe(**kwarg_inputs) end = time.time() print("End warmup") print(f"Warmup time: {end - begin:.3f}s") @@ -380,7 +387,7 @@ def get_kwarg_inputs(): # Let"s see it! # Note: Progress bar might work incorrectly due to the async nature of CUDA. 
- kwarg_inputs = get_kwarg_inputs() + iter_profiler = IterationProfiler() if "callback_on_step_end" in inspect.signature(pipe).parameters: kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end @@ -402,6 +409,9 @@ def get_kwarg_inputs(): else: cuda_mem_after_used = torch.cuda.max_memory_allocated() / (1024**3) print(f"Max used CUDA memory : {cuda_mem_after_used:.3f}GiB") + if args.compiler != "oneflow": + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Peak CUDA memory : {cuda_mem_max_reserved:.3f}GiB") print("=======================================") if args.print_output: diff --git a/onediff_diffusers_extensions/examples/flux/README.md b/onediff_diffusers_extensions/examples/flux/README.md new file mode 100644 index 000000000..033f12763 --- /dev/null +++ b/onediff_diffusers_extensions/examples/flux/README.md @@ -0,0 +1,101 @@ +# Run FLUX with nexfort backend (Beta Release) + +1. [Environment Setup](#environment-setup) + - [Set Up OneDiff](#set-up-onediff) + - [Set Up NexFort Backend](#set-up-nexfort-backend) + - [Set Up Diffusers Library](#set-up-diffusers) + - [Set Up FLUX](#set-up-flux) +2. [Execution Instructions](#run) + - [Run Without Compilation (Baseline)](#run-without-compilation-baseline) + - [Run With Compilation](#run-with-compilation) +3. [Performance Comparison](#performance-comparison) +4. 
[Dynamic Shape for FLUX](#dynamic-shape-for-flux) + +## Environment setup +### Set up onediff +https://github.com/siliconflow/onediff?tab=readme-ov-file#installation + +### Set up nexfort backend +https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort + +### Set up diffusers + +``` +pip3 install --upgrade diffusers[torch] +``` +### Set up FLUX +Model version for diffusers: https://huggingface.co/black-forest-labs/FLUX.1-schnell + +HF pipeline: https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/flux.md + +## Run + +### Run without compilation (Baseline) +```shell +python3 benchmarks/text_to_image.py \ + --model black-forest-labs/FLUX.1-schnell \ + --height 1024 --width 1024 \ + --scheduler none \ + --steps 4 \ + --output-image ./flux-schnell.png \ + --prompt "beautiful scenery nature glass bottle landscape, , purple galaxy bottle," \ + --compiler none \ + --dtype bfloat16 \ + --seed 1 \ + --print-output +``` + +### Run with compilation + +```shell +python3 benchmarks/text_to_image.py \ + --model black-forest-labs/FLUX.1-schnell \ + --height 1024 --width 1024 \ + --scheduler none \ + --steps 4 \ + --output-image ./flux-schnell-compile.png \ + --prompt "beautiful scenery nature glass bottle landscape, , purple galaxy bottle," \ + --compiler nexfort \ + --compiler-config '{"mode": "benchmark:cudagraphs:max-autotune:low-precision:cache-all", "memory_format": "channels_last", "options": {"cuda.fuse_timestep_embedding": false, "inductor.force_triton_sdpa": true}}' \ + --dtype bfloat16 \ + --seed 1 \ + --print-output +``` + +## Performance comparison + +Testing on NVIDIA A800-SXM4-80GB, with image size of 1024*1024, iterating 4 steps: +| Metric | A800-SXM4-80GB 1024*1024 | +| ------------------------------------ | ------------------------ | +| Data update date (yyyy-mm-dd) | 2024-08-07 | +| PyTorch iteration speed | 2.18 it/s | +| OneDiff iteration speed | 2.80 it/s (+28.4%) | +| PyTorch E2E time | 2.06 s | 
+| OneDiff E2E time | 1.53 s (-25.7%) | +| PyTorch Max Mem Used | 35.79 GiB | +| OneDiff Max Mem Used | 40.44 GiB | +| PyTorch Warmup with Run time | 2.81 s | +| OneDiff Warmup with Compilation time | 253.01 s | +| OneDiff Warmup with Cache time | 73.63 s | + +Note: OneDiff Warmup with Compilation time was measured on an Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz. It is for reference only and varies considerably across CPUs. + + +## Dynamic shape for FLUX + +Run: + +```shell +python3 benchmarks/text_to_image.py \ + --model black-forest-labs/FLUX.1-schnell \ + --height 1024 --width 1024 \ + --scheduler none \ + --steps 4 \ + --output-image ./flux-schnell-compile.png \ + --prompt "beautiful scenery nature glass bottle landscape, , purple galaxy bottle," \ + --compiler nexfort \ + --compiler-config '{"mode": "benchmark:cudagraphs:max-autotune:low-precision:cache-all", "memory_format": "channels_last", "options": {"cuda.fuse_timestep_embedding": false, "inductor.force_triton_sdpa": true}, "dynamic": true}' \ + --run_multiple_resolutions 1 \ + --dtype bfloat16 \ + --seed 1 +``` diff --git a/onediff_diffusers_extensions/examples/text_to_image_flux.py b/onediff_diffusers_extensions/examples/text_to_image_flux.py index 15ad0427d..245e46c38 100644 --- a/onediff_diffusers_extensions/examples/text_to_image_flux.py +++ b/onediff_diffusers_extensions/examples/text_to_image_flux.py @@ -18,32 +18,35 @@ parser.add_argument("--height", type=int, default=512) parser.add_argument("--width", type=int, default=512) parser.add_argument("--n_steps", type=int, default=4) -parser.add_argument( - "--saved_image", type=str, required=False, default="flux-out.png" -) +parser.add_argument("--saved_image", type=str, required=False, default="flux-out.png") parser.add_argument("--seed", type=int, default=1) parser.add_argument("--warmup", type=int, default=1) parser.add_argument("--run", type=int, default=3) parser.add_argument( "--compile", type=(lambda x: str(x).lower() in ["true", "1", 
"yes"]), default=True ) +parser.add_argument("--run-multiple-resolutions", action="store_true") args = parser.parse_args() # load stable diffusion -import ipdb; ipdb.set_trace() -# pipe = FluxPipeline.from_pretrained(args.base, torch_dtype=torch.bfloat16, local_files_only=True, custom_revision="93424e3") -pipe = FluxPipeline.from_pretrained(args.base, torch_dtype=torch.bfloat16, custom_revision="93424e3a1530639fefdf08d2a7a954312e5cb254") -# pipe = FluxPipeline.from_pretrained(args.base, torch_dtype=torch.bfloat16) -# pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.float16) +pipe = FluxPipeline.from_pretrained(args.base, torch_dtype=torch.bfloat16) +# pipe = FluxPipeline.from_pretrained(args.base, torch_dtype=torch.bfloat16, local_files_only=True, revision="93424e3a1530639fefdf08d2a7a954312e5cb254") pipe.to("cuda") -# import ipdb; ipdb.set_trace() - if args.compile: from onediffx import compile_pipe - pipe = compile_pipe(pipe, backend="nexfort") + pipe = compile_pipe( + pipe, + backend="nexfort", + options={ + "options": { + "cuda.fuse_timestep_embedding": False, + "inductor.force_triton_sdpa": True, + } + }, + ) # generate image @@ -56,8 +59,8 @@ height=args.height, width=args.width, output_type="pil", - num_inference_steps=args.n_steps, #use a larger number if you are using [dev] - generator=torch.Generator("cpu").manual_seed(args.seed) + num_inference_steps=args.n_steps, # use a larger number if you are using [dev] + generator=torch.Generator("cpu").manual_seed(args.seed), ).images[0] @@ -69,19 +72,30 @@ height=args.height, width=args.width, output_type="pil", - num_inference_steps=args.n_steps, #use a larger number if you are using [dev] - generator=torch.Generator("cpu").manual_seed(args.seed) + num_inference_steps=args.n_steps, # use a larger number if you are using [dev] + generator=torch.Generator("cpu").manual_seed(args.seed), ).images[0] end = time.time() print(f"Inference time: {end - begin:.3f}s") 
image.save(f"{i=}th_{args.saved_image}.png") -image = pipe( - args.prompt, - height=args.height // 2, - width=args.width // 2, - output_type="pil", - num_inference_steps=args.n_steps, #use a larger number if you are using [dev] - generator=torch.Generator("cpu").manual_seed(args.seed) -).images[0] \ No newline at end of file + +if args.run_multiple_resolutions: + print("Test run with multiple resolutions...") + sizes = [1024, 512, 768, 256] + for h in sizes: + for w in sizes: + print(f"Running at resolution: {h}x{w}") + start_time = time.time() + image = pipe( + args.prompt, + height=h, + width=w, + output_type="pil", + num_inference_steps=args.n_steps, # use a larger number if you are using [dev] + generator=torch.Generator("cpu").manual_seed(args.seed), + ).images[0] + end_time = time.time() + print(f"Inference time: {end_time - start_time:.2f} seconds") + image.save(f"{i=}th_{args.saved_image}_{h}x{w}.png") diff --git a/src/onediff/utils/import_utils.py b/src/onediff/utils/import_utils.py index 111387966..33d72046c 100644 --- a/src/onediff/utils/import_utils.py +++ b/src/onediff/utils/import_utils.py @@ -23,7 +23,7 @@ def check_module_availability(module_name): return True -_oneflow_available = check_module_availability("oneflow") +_oneflow_available = None _onediff_quant_available = check_module_availability("onediff_quant") _nexfort_available = check_module_availability("nexfort") @@ -33,6 +33,9 @@ def check_module_availability(module_name): def is_oneflow_available(): + global _oneflow_available + if _oneflow_available is None: + _oneflow_available = check_module_availability("oneflow") return _oneflow_available From 3091b5dcc6610b4b065cf4242cefe00d7462e454 Mon Sep 17 00:00:00 2001 From: Wang Yi <53533850+marigoold@users.noreply.github.com> Date: Thu, 8 Aug 2024 10:08:38 +0800 Subject: [PATCH 4/4] Update diffusion_pipeline_compiler.py --- .../onediffx/compilers/diffusion_pipeline_compiler.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/onediff_diffusers_extensions/onediffx/compilers/diffusion_pipeline_compiler.py b/onediff_diffusers_extensions/onediffx/compilers/diffusion_pipeline_compiler.py index fba45330d..e7907f37f 100644 --- a/onediff_diffusers_extensions/onediffx/compilers/diffusion_pipeline_compiler.py +++ b/onediff_diffusers_extensions/onediffx/compilers/diffusion_pipeline_compiler.py @@ -37,7 +37,6 @@ def _recursive_setattr(obj, attr, value): "vqgan.up_blocks", # for StableCascadeDecoderPipeline "vae.decoder", "vae.encoder", - "transformer", ]