[WIP] Stable Diffusion 3.x and Flux Optimization #22986

Draft · wants to merge 12 commits into base: main
@@ -203,35 +203,55 @@ This step will export stable diffusion 1.5 to ONNX model in float32 using script

```
curl https://raw.githubusercontent.com/huggingface/diffusers/v0.15.1/scripts/convert_stable_diffusion_checkpoint_to_onnx.py > convert_sd_onnx.py
-python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path ./sd_v1_5/fp32
+python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path ./sd1.5_onnx/fp32
```
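Once the export finishes, a quick way to confirm the sub-models are loadable is to open each one with onnxruntime. This is a minimal sketch (not part of the PR), assuming onnxruntime is installed and the output directory from the command above:
```
import onnxruntime as ort

# Each component is exported as <component>/model.onnx under the output directory.
for component in ["text_encoder", "unet", "vae_encoder", "vae_decoder"]:
    session = ort.InferenceSession(f"./sd1.5_onnx/fp32/{component}/model.onnx", providers=["CPUExecutionProvider"])
    print(component, [inp.name for inp in session.get_inputs()])
```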

For SDXL, use optimum to export the model:
```
pip install optimum diffusers onnx onnxruntime-gpu
-optimum-cli export onnx --model stabilityai/stable-diffusion-xl-base-1.0 --task stable-diffusion-xl ./sd_xl_base_onnx
+optimum-cli export onnx --model stabilityai/stable-diffusion-xl-base-1.0 --task stable-diffusion-xl ./sdxl_onnx/fp32
```

+#### Stable Diffusion 3.x and Flux 1.0
+
+Stable Diffusion 3.x and Flux 1.0 require transformers >= 4.45 and optimum > 1.23.3:
+```
+git clone https://github.com/huggingface/optimum
+cd optimum
+pip install -e .
+
+optimum-cli export onnx --model stabilityai/stable-diffusion-3-medium-diffusers ./sd3_onnx/fp32
+optimum-cli export onnx --model stabilityai/stable-diffusion-3.5-medium ./sd3.5_medium_onnx/fp32
+optimum-cli export onnx --model stabilityai/stable-diffusion-3.5-large ./sd3.5_large_onnx/fp32
+optimum-cli export onnx --model black-forest-labs/FLUX.1-schnell ./flux1_schnell_onnx/fp32
+optimum-cli export onnx --model black-forest-labs/FLUX.1-dev ./flux1_dev_onnx/fp32
+```

### Optimize ONNX Pipeline

-Example to optimize the exported float32 ONNX models, and save to float16 models:
+Example to optimize the exported float32 ONNX models and save them as float16 models:
```
-python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i ./sd_v1_5/fp32 -o ./sd_v1_5/fp16 --float16
+python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i ./sd1.5_onnx/fp32 -o ./sd1.5_onnx/fp16 --float16
```
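Under the hood, optimize_pipeline.py runs graph fusion on each sub-model and then optionally converts it to float16. A rough per-model sketch using the onnxruntime.transformers optimizer (simplified; the real script also handles external data, forced-FP32 ops, and SDXL special cases):
```
from onnxruntime.transformers import optimizer

# Fuse the graph first; converting to fp16 beforehand would block some fusions.
m = optimizer.optimize_model("./sd1.5_onnx/fp32/unet/model.onnx", model_type="unet", opt_level=0)
m.convert_float_to_float16(keep_io_types=False)
m.save_model_to_file("./sd1.5_onnx/fp16/unet/model.onnx")
```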

-In all examples below, we run the scripts in source code directory. You can get source code like the following:
+You can also run the script from the source code directory, like the following:
```
git clone https://github.com/microsoft/onnxruntime
cd onnxruntime/onnxruntime/python/tools/transformers/models/stable_diffusion

+python optimize_pipeline.py -i ./sdxl_onnx/fp32 -o ./sdxl_onnx/fp16 --float16
+python optimize_pipeline.py -i ./sd3_onnx/fp32 -o ./sd3_onnx/fp16 --float16
+python optimize_pipeline.py -i ./sd3.5_medium_onnx/fp32 -o ./sd3.5_medium_onnx/fp16 --float16
+python optimize_pipeline.py -i ./flux1_schnell_onnx/fp32 -o ./flux1_schnell_onnx/fp16 --float16
+python optimize_pipeline.py -i ./flux1_dev_onnx/fp32 -o ./flux1_dev_onnx/fp16 --float16
```

For the SDXL model, it is recommended to use a machine with 48 GB or more memory for optimization.
```
python optimize_pipeline.py -i ./sd_xl_base_onnx -o ./sd_xl_base_fp16 --float16
```

### Run Benchmark

#### Run Benchmark with Optimum

The benchmark.py script runs a warm-up prompt twice and measures the peak GPU memory usage of each run, recording them as first_run_memory_MB and second_run_memory_MB. It then runs five more iterations to get the average latency (in seconds), and writes the results to benchmark_result.csv.

Note that the first run may take more time and memory: for example, cuDNN convolution algorithm search and model compilation happen during the first run.
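In pseudocode, the measurement protocol looks roughly like this (a sketch, not the actual implementation; the real script also tracks peak GPU memory around each warm-up run):
```
import time

def measure(pipe, prompt, steps, repeats=5):
    pipe(prompt, num_inference_steps=steps)  # first warm-up run: cuDNN algo search, compilation
    pipe(prompt, num_inference_steps=steps)  # second warm-up run: steady-state memory
    latencies = []
    for _ in range(repeats):
        start = time.time()
        pipe(prompt, num_inference_steps=steps)
        latencies.append(time.time() - start)
    return sum(latencies) / len(latencies)  # average latency in seconds
```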
@@ -245,15 +265,15 @@ Before running benchmark on PyTorch, you need to be logged in via `huggingface-c

Example to benchmark the optimized pipeline of stable diffusion 1.5 with batch size 1 on CUDA EP:
```
-python benchmark.py -p ./sd_v1_5/fp16 -b 1 -v 1.5
+python benchmark.py -p ./sd1.5_onnx/fp16 -b 1 -v 1.5
python benchmark.py -b 1 -v 1.5
```
For the first command, '-p' specifies the directory of an optimized ONNX pipeline, as generated by optimize_pipeline.py.
-For the second command without '-p', we will use OnnxruntimeCudaStableDiffusionPipeline to export and optimize ONNX models for clip, unet and vae decoder.
+For the second command without '-p', we will use ORTPipelineForText2Image to export and optimize ONNX models for the clip, unet and vae decoder.
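The export path of the second command is roughly equivalent to the following sketch (the save directory is illustrative; assumes optimum and onnxruntime-gpu are installed):
```
from optimum.onnxruntime import ORTPipelineForText2Image

pipeline = ORTPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    export=True,  # convert the PyTorch checkpoint to ONNX on the fly
    provider="CUDAExecutionProvider",
)
pipeline.save_pretrained("./sd1.5_onnx/exported")  # cache the export for later runs
```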

On ROCm EP, use the following command instead:
```
-python benchmark.py -p ./sd_v1_5/fp16 -b 1 --tuning --provider rocm -v 1.5
+python benchmark.py -p ./sd1.5_onnx/fp16 -b 1 --tuning --provider rocm -v 1.5
```

For ROCm EP, you can substitute `python benchmark.py` with `python -m onnxruntime.transformers.models.stable_diffusion.benchmark` since
@@ -263,6 +283,13 @@ For ROCm EP, the `--tuning` is mandatory because we heavily rely on tuning to fi

The default parameters are stable diffusion version=1.5, height=512, width=512, steps=50, batch_count=5. Run `python benchmark.py --help` for more information.

+#### Stable Diffusion 3.x and Flux 1.0
+Example of benchmarking stable diffusion 3.5 with optimum on the CUDA provider:
+```
+python benchmark.py -e optimum --height 1024 --width 1024 --steps 20 -b 1 -v 3.5M -p sd3.5_medium_onnx/fp32
+python benchmark.py -e optimum --height 1024 --width 1024 --steps 20 -b 1 -v 3.5M -p sd3.5_medium_onnx/fp16
+```

### Run Benchmark with xFormers

Run PyTorch 1.13.1+cu117 with xFormers like the following
@@ -22,6 +22,11 @@
"2.0": "stabilityai/stable-diffusion-2",
"2.1": "stabilityai/stable-diffusion-2-1",
"xl-1.0": "stabilityai/stable-diffusion-xl-refiner-1.0",
"3.0M": "stabilityai/stable-diffusion-3-medium-diffusers",
"3.5M": "stabilityai/stable-diffusion-3.5-medium",
"3.5L": "stabilityai/stable-diffusion-3.5-large",
"Flux.1S": "black-forest-labs/FLUX.1-schnell",
"Flux.1D": "black-forest-labs/FLUX.1-dev",
}

PROVIDERS = {
@@ -322,33 +327,12 @@ def get_optimum_ort_pipeline(
    disable_safety_checker: bool = True,
    use_io_binding: bool = False,
):
-    from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
+    from optimum.onnxruntime import ORTPipelineForText2Image

    if directory is not None and os.path.exists(directory):
-        if "xl" in model_name:
-            pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
-                directory,
-                provider=provider,
-                session_options=None,
-                use_io_binding=False,  # Not supported by Optimum version 1.17.1 at the time of verification.
-            )
-        else:
-            pipeline = ORTStableDiffusionPipeline.from_pretrained(
-                directory,
-                provider=provider,
-                use_io_binding=use_io_binding,
-            )
-    elif "xl" in model_name:
-        pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
-            model_name,
-            export=True,
-            provider=provider,
-            session_options=None,
-            use_io_binding=False,  # Not supported by Optimum version 1.17.1 at the time of verification.
-        )
-        pipeline.save_pretrained(directory)
+        pipeline = ORTPipelineForText2Image.from_pretrained(directory, provider=provider, use_io_binding=use_io_binding)
    else:
-        pipeline = ORTStableDiffusionPipeline.from_pretrained(
+        pipeline = ORTPipelineForText2Image.from_pretrained(
            model_name,
            export=True,
            provider=provider,
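Note: the per-architecture branches collapse because `ORTPipelineForText2Image` is an auto class. As far as I can tell, it inspects the pipeline's model_index.json and instantiates the matching ORT pipeline class, so the same call covers SD 1.x/2.x, SDXL, SD 3.x and Flux. A small illustration (paths assume the exports from the README):
```
from optimum.onnxruntime import ORTPipelineForText2Image

pipe = ORTPipelineForText2Image.from_pretrained("./sdxl_onnx/fp16", provider="CUDAExecutionProvider")
print(type(pipe).__name__)  # expected to be the XL-specific ORT pipeline class
```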
@@ -376,10 +360,7 @@ def run_optimum_ort_pipeline(
    memory_monitor_type,
    use_num_images_per_prompt=False,
):
-    from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
-
-    assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline))
-
+    print("Pipeline type", type(pipe))
    prompts, negative_prompt = example_prompts()

    def warmup():
@@ -27,6 +27,7 @@
import coloredlogs
import onnx
from fusion_options import FusionOptions
+from onnx_model_mmdit import MmditOnnxModel
from onnx_model_clip import ClipOnnxModel
from onnx_model_unet import UnetOnnxModel
from onnx_model_vae import VaeOnnxModel
@@ -46,9 +47,20 @@ def has_external_data(onnx_model_path):
    return False


+def _get_model_list(source_dir: Path):
+    is_xl = (source_dir / "text_encoder_2").exists()
+    is_sd3 = (source_dir / "text_encoder_3").exists()
+    model_list_sd3 = ["text_encoder", "text_encoder_2", "text_encoder_3", "transformer", "vae_encoder", "vae_decoder"]
+    model_list_sdxl = ["text_encoder", "text_encoder_2", "unet", "vae_encoder", "vae_decoder"]
+    model_list_sd = ["text_encoder", "unet", "vae_encoder", "vae_decoder"]
+    model_list = model_list_sd3 if is_sd3 else (model_list_sdxl if is_xl else model_list_sd)
+    return model_list


def _optimize_sd_pipeline(
    source_dir: Path,
    target_dir: Path,
+    model_list: List[str],
    use_external_data_format: Optional[bool],
    float16: bool,
    force_fp32_ops: List[str],
@@ -60,6 +72,7 @@ def _optimize_sd_pipeline(
    Args:
        source_dir (Path): Root of input directory of stable diffusion onnx pipeline with float32 models.
        target_dir (Path): Root of output directory of stable diffusion onnx pipeline with optimized models.
+        model_list (List[str]): list of directory names that contain an ONNX model.
        use_external_data_format (Optional[bool]): use external data format.
        float16 (bool): use half precision.
        force_fp32_ops (List[str]): operators that are forced to run in float32.
@@ -70,18 +83,21 @@
        RuntimeError: output onnx model path existed
    """
    model_type_mapping = {
+        "transformer": "mmdit",
        "unet": "unet",
        "vae_encoder": "vae",
        "vae_decoder": "vae",
        "text_encoder": "clip",
        "text_encoder_2": "clip",
        "safety_checker": "unet",
+        "text_encoder_3": "clip",
    }

    model_type_class_mapping = {
        "unet": UnetOnnxModel,
        "vae": VaeOnnxModel,
        "clip": ClipOnnxModel,
+        "mmdit": MmditOnnxModel,
    }

    force_fp32_operators = {
@@ -91,10 +107,10 @@
        "text_encoder": [],
        "text_encoder_2": [],
        "safety_checker": [],
+        "text_encoder_3": [],
+        "transformer": [],
    }

-    is_xl = (source_dir / "text_encoder_2").exists()
-
    if force_fp32_ops:
        for fp32_operator in force_fp32_ops:
            parts = fp32_operator.split(":")
@@ -108,8 +124,8 @@
    for name, model_type in model_type_mapping.items():
        onnx_model_path = source_dir / name / "model.onnx"
        if not os.path.exists(onnx_model_path):
-            if name != "safety_checker":
-                logger.info("input onnx model does not exist: %s", onnx_model_path)
+            if name != "safety_checker" and name in model_list:
+                logger.warning("input onnx model does not exist: %s", onnx_model_path)
            # some models are optional so we do not raise an error here.
            continue

@@ -122,7 +138,7 @@
            use_external_data_format = has_external_data(onnx_model_path)

        # Graph fusion before fp16 conversion, otherwise they cannot be fused later.
-        logger.info(f"Optimize {onnx_model_path}...")
+        logger.info("Optimize %s ...", onnx_model_path)

        args.model_type = model_type
        fusion_options = FusionOptions.parse(args)
@@ -147,6 +163,7 @@

        if float16:
            # For SD-XL, using FP16 in the VAE decoder causes NaN and black images, so we keep it in FP32.
+            is_xl = (source_dir / "text_encoder_2").exists()
            if is_xl and name == "vae_decoder":
                logger.info("Skip converting %s to float16 to avoid NaN", name)
            else:
@@ -181,17 +198,18 @@
    logger.info("*" * 20)


-def _copy_extra_directory(source_dir: Path, target_dir: Path):
+def _copy_extra_directory(source_dir: Path, target_dir: Path, model_list: List[str]):
    """Copy extra directories that do not have an ONNX model.

    Args:
        source_dir (Path): source directory
        target_dir (Path): target directory
+        model_list (List[str]): list of directory names that contain an ONNX model.

    Raises:
        RuntimeError: source path does not exist
    """
-    extra_dirs = ["scheduler", "tokenizer", "tokenizer_2", "feature_extractor"]
+    extra_dirs = ["scheduler", "tokenizer", "tokenizer_2", "tokenizer_3", "feature_extractor"]

    for name in extra_dirs:
        source_path = source_dir / name
@@ -213,8 +231,7 @@ def _copy_extra_directory(source_dir: Path, target_dir: Path):
        logger.info("%s => %s", source_path, target_path)

    # Some directories are optional
-    onnx_model_dirs = ["text_encoder", "text_encoder_2", "unet", "vae_encoder", "vae_decoder", "safety_checker"]
-    for onnx_model_dir in onnx_model_dirs:
+    for onnx_model_dir in model_list:
        source_path = source_dir / onnx_model_dir / "config.json"
        target_path = target_dir / onnx_model_dir / "config.json"
        if source_path.exists():
@@ -236,17 +253,20 @@ def optimize_stable_diffusion_pipeline(
        if overwrite:
            shutil.rmtree(output_dir, ignore_errors=True)
        else:
-            raise RuntimeError("output directory existed:{output_dir}. Add --overwrite to empty the directory.")
+            raise RuntimeError(f"output directory existed:{output_dir}. Add --overwrite to empty the directory.")

    source_dir = Path(input_dir)
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

-    _copy_extra_directory(source_dir, target_dir)
+    model_list = _get_model_list(source_dir)
+
+    _copy_extra_directory(source_dir, target_dir, model_list)

    _optimize_sd_pipeline(
        source_dir,
        target_dir,
+        model_list,
        use_external_data_format,
        float16,
        args.force_fp32_ops,