Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add flux example #1126

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions benchmarks/run_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#!/bin/bash
# Benchmark driver: runs selected diffusion models with several compiler
# backends and accumulates timing/memory numbers into a markdown table.
set -e

# Which model group(s) to run.
# e.g. ./run_benchmark.sh sd15,sd21,sdxl  or  ./run_benchmark.sh all
run_model="${1:-}"



# Environment knobs read by the nexfort compiler backend.
export NEXFORT_GRAPH_CACHE=1
export NEXFORT_FX_FORCE_TRITON_SDPA=1


# Model checkpoint locations.
model_dir="/data1/hf_model"
sd15_path="${model_dir}/stable-diffusion-v1-5"
sd21_path="${model_dir}/stable-diffusion-2-1"
sdxl_path="${model_dir}/stable-diffusion-xl-base-1.0"
sd3_path="/data1/home/zhangxu/stable-diffusion-3-medium-diffusers"
flux_dev_path="${model_dir}/FLUX.1-dev/snapshots/0ef5fff789c832c5c7f4e127f94c8b54bbcced44"
flux_schnell_path="${model_dir}/FLUX.1-schnell"

# Date stamp (yyyy-mm-dd) recorded in the results table.
current_time=$(date +"%Y-%m-%d")
echo "Current time: ${current_time}"

# First NVIDIA GPU name, normalized for use in a filename:
# drop the "NVIDIA " prefix and replace spaces with underscores.
gpu_name=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader,nounits | head -n 1 | sed 's/NVIDIA //; s/ /_/g')

# Markdown table header; benchmark rows are appended to this string.
BENCHMARK_RESULT_TEXT="| Data update date (yyyy-mm-dd) | GPU | Model | HxW | Compiler | Quantization | Iteration speed (it/s) | E2E Time (s) | Max used CUDA memory (GiB) | Warmup time (s) |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n"


prompt="beautiful scenery nature glass bottle landscape, purple galaxy bottle"
quantize_config='{"quant_type": "fp8_e4m3_e4m3_dynamic_per_tensor"}'

# The oneflow backend takes no compiler_config, so none are defined for
# sd15/sd21/sdxl.
#sd15_nexfort_compiler_config=""
#sd21_nexfort_compiler_config=""
#sdxl_nexfort_compiler_config=""

sd3_nexfort_compiler_config='{"mode": "max-optimize:max-autotune:low-precision:cache-all", "memory_format": "channels_last"}'
flux_nexfort_compiler_config='{"mode": "max-optimize:max-autotune:low-precision", "memory_format": "channels_last"}'


#######################################
# Benchmark one model at one resolution and append a markdown row of results.
# Globals:
#   prompt, quantize_config, current_time, gpu_name (read)
#   BENCHMARK_RESULT_TEXT (appended to)
# Arguments:
#   $1 model_name      e.g. sd15, sd3, flux_dev (selects the runner script)
#   $2 model_path      checkpoint directory
#   $3 steps           number of inference steps
#   $4 compiler        none | oneflow | nexfort | transform
#   $5 compiler_config JSON string, or "none"
#   $6 height          image height in pixels
#   $7 width           image width in pixels
#   $8 quantize        True | False
#######################################
benchmark_model_with_one_resolution() {
  local model_name=$1
  local model_path=$2
  local steps=$3
  local compiler=$4
  local compiler_config=$5
  local height=$6
  local width=$7
  local quantize=$8
  local script_path script_output

  echo "Running ${model_path} ${height}x${width}..."

  # Pick the per-model-family example script; fall back to the generic one.
  if [[ "${model_name}" =~ sd3 ]]; then
    script_path="onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py"
  elif [[ "${model_name}" =~ flux ]]; then
    script_path="onediff_diffusers_extensions/examples/flux/text_to_image_flux.py"
  else
    script_path="benchmarks/text_to_image.py"
  fi

  # Build the command as an array so the optional quantize flags can be
  # inserted without duplicating the whole invocation.
  local cmd=(python3 "${script_path}"
    --model "${model_path}" --variant fp16 --steps "${steps}"
    --height "${height}" --width "${width}" --seed 1
    --compiler "${compiler}" --compiler-config "${compiler_config}")
  if [[ ${quantize} == True ]]; then
    cmd+=(--quantize --quantize-config "${quantize_config}")
  fi
  cmd+=(--prompt "${prompt}" --print-output)

  # tee to /dev/tty keeps live progress visible while capturing the output
  # for metric parsing below.
  script_output=$("${cmd[@]}" | tee /dev/tty)

  # Extract the metrics printed by the benchmark script.
  local inference_time iterations_per_second max_used_cuda_memory warmup_time
  inference_time=$(echo "${script_output}" | grep -oP '(?<=Inference time: )\d+\.\d+')
  iterations_per_second=$(echo "${script_output}" | grep -oP '(?<=Iterations per second: )\d+\.\d+')
  max_used_cuda_memory=$(echo "${script_output}" | grep -oP '(?<=Max used CUDA memory : )\d+\.\d+')
  warmup_time=$(echo "${script_output}" | grep -oP '(?<=Warmup time: )\d+\.\d+')

  # Append one markdown table row to the accumulated results.
  BENCHMARK_RESULT_TEXT="${BENCHMARK_RESULT_TEXT}| ${current_time} | ${gpu_name} | ${model_name} | ${height}x${width} | ${compiler} | ${quantize} | ${iterations_per_second} | ${inference_time} | ${max_used_cuda_memory} | ${warmup_time} |\n"
}

# conda init: make `conda activate` usable in this non-interactive shell.
source ~/miniconda3/etc/profile.d/conda.sh

#########################################
# if run_model contains sd15 or all, run sd15
if [[ "${run_model}" =~ sd15|all ]]; then
    # activate oneflow environment
    conda activate oneflow
    benchmark_model_with_one_resolution sd15 "${sd15_path}" 30 none none 512 512 False
    benchmark_model_with_one_resolution sd15 "${sd15_path}" 30 oneflow none 512 512 False
    benchmark_model_with_one_resolution sd15 "${sd15_path}" 30 oneflow none 512 512 True
fi

# if run_model contains sd21 or all, run sd21
if [[ "${run_model}" =~ sd21|all ]]; then
    # activate oneflow environment
    conda activate oneflow
    benchmark_model_with_one_resolution sd21 "${sd21_path}" 20 none none 768 768 False
    benchmark_model_with_one_resolution sd21 "${sd21_path}" 20 oneflow none 768 768 False
    benchmark_model_with_one_resolution sd21 "${sd21_path}" 20 oneflow none 768 768 True
fi

# if run_model contains sdxl or all, run sdxl
if [[ "${run_model}" =~ sdxl|all ]]; then
    # activate oneflow environment
    conda activate oneflow
    benchmark_model_with_one_resolution sdxl "${sdxl_path}" 30 none none 1024 1024 False
    benchmark_model_with_one_resolution sdxl "${sdxl_path}" 30 oneflow none 1024 1024 False
    benchmark_model_with_one_resolution sdxl "${sdxl_path}" 30 oneflow none 1024 1024 True
fi
#########################################

#########################################
# if run_model contains sd3 or all, run sd3
if [[ "${run_model}" =~ sd3|all ]]; then
    # activate nexfort environment
    conda activate nexfort
    benchmark_model_with_one_resolution sd3 "${sd3_path}" 28 none none 1024 1024 False
    benchmark_model_with_one_resolution sd3 "${sd3_path}" 28 nexfort "${sd3_nexfort_compiler_config}" 1024 1024 False
    benchmark_model_with_one_resolution sd3 "${sd3_path}" 28 nexfort "${sd3_nexfort_compiler_config}" 1024 1024 True
fi

# if run_model contains flux or all, run flux
if [[ "${run_model}" =~ flux|all ]]; then
    # activate nexfort environment
    conda activate nexfort
    benchmark_model_with_one_resolution flux_dev "${flux_dev_path}" 20 none none 1024 1024 False
    benchmark_model_with_one_resolution flux_dev "${flux_dev_path}" 20 nexfort "${flux_nexfort_compiler_config}" 1024 1024 False
    benchmark_model_with_one_resolution flux_dev "${flux_dev_path}" 20 nexfort "${flux_nexfort_compiler_config}" 1024 1024 True
    benchmark_model_with_one_resolution flux_dev "${flux_dev_path}" 20 transform none 1024 1024 False

    benchmark_model_with_one_resolution flux_schnell "${flux_schnell_path}" 4 none none 1024 1024 False
    benchmark_model_with_one_resolution flux_schnell "${flux_schnell_path}" 4 nexfort "${flux_nexfort_compiler_config}" 1024 1024 False
    benchmark_model_with_one_resolution flux_schnell "${flux_schnell_path}" 4 nexfort "${flux_nexfort_compiler_config}" 1024 1024 True
    benchmark_model_with_one_resolution flux_schnell "${flux_schnell_path}" 4 transform none 1024 1024 False
fi
#########################################


echo -e "\nBenchmark Results:"
# Print the accumulated table and append it to a per-GPU markdown file.
echo -e "${BENCHMARK_RESULT_TEXT}" | tee -a "benchmark_result_${gpu_name}.md"
echo -e "\nBenchmark Done!"
8 changes: 8 additions & 0 deletions benchmarks/text_to_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import torch
from diffusers.utils import load_image
from onediff.infer_compiler import oneflow_compile
from onediff.optimization.quant_optimizer import quantize_model

from onediffx import ( # quantize_pipe currently only supports the nexfort backend.
compile_pipe,
Expand Down Expand Up @@ -252,6 +253,13 @@ def main():
print("Oneflow backend is now active...")
# Note: The compile_pipe() based on the oneflow backend is incompatible with T5EncoderModel.
# pipe = compile_pipe(pipe)

if args.quantize:
if hasattr(pipe, "unet"):
pipe.unet = quantize_model(pipe.unet)
if hasattr(pipe, "transformer"):
pipe.transformer = quantize_model(pipe.transformer)

if hasattr(pipe, "unet"):
pipe.unet = oneflow_compile(pipe.unet)
if hasattr(pipe, "transformer"):
Expand Down
129 changes: 129 additions & 0 deletions onediff_diffusers_extensions/examples/flux/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Run Flux with onediff


## Environment setup

### Set up onediff
https://github.com/siliconflow/onediff?tab=readme-ov-file#installation

### Set up compiler backend
Support two backends: oneflow and nexfort.

https://github.com/siliconflow/onediff?tab=readme-ov-file#install-a-compiler-backend

### Set up flux
HF model: https://huggingface.co/black-forest-labs/FLUX.1-dev and https://huggingface.co/black-forest-labs/FLUX.1-schnell

HF pipeline: https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux

### Set up others
Install extra pkgs and set environment variable.
```bash
pip install --upgrade transformers
pip install --upgrade diffusers[torch]
pip install nvidia-cublas-cu12==12.4.5.8

export NEXFORT_FX_FORCE_TRITON_SDPA=1
```

## Run

### Run FLUX.1-dev 1024*1024 without compile (the original pytorch HF diffusers baseline)
```
python3 onediff_diffusers_extensions/examples/flux/text_to_image_flux.py \
--model black-forest-labs/FLUX.1-dev \
--height 1024 \
--width 1024 \
--steps 20 \
--seed 1 \
--output-image ./flux.png
```

### Run FLUX.1-dev 1024*1024 with compile [nexfort backend]

```
python3 onediff_diffusers_extensions/examples/flux/text_to_image_flux.py \
--model black-forest-labs/FLUX.1-dev \
--height 1024 \
--width 1024 \
--steps 20 \
--seed 1 \
--compiler nexfort \
--compiler-config '{"mode": "max-optimize:max-autotune:low-precision:cache-all", "memory_format": "channels_last"}' \
--output-image ./flux_nexfort_compile.png
```


### Run FLUX.1-schnell 1024*1024 without compile (the original pytorch HF diffusers baseline)
```
python3 onediff_diffusers_extensions/examples/flux/text_to_image_flux.py \
--model black-forest-labs/FLUX.1-schnell \
--height 1024 \
--width 1024 \
--steps 4 \
--seed 1 \
--output-image ./flux.png
```

### Run FLUX.1-schnell 1024*1024 with compile [nexfort backend]

```
python3 onediff_diffusers_extensions/examples/flux/text_to_image_flux.py \
--model black-forest-labs/FLUX.1-schnell \
--height 1024 \
--width 1024 \
--steps 4 \
--seed 1 \
--compiler nexfort \
--compiler-config '{"mode": "max-optimize:max-autotune:low-precision:cache-all", "memory_format": "channels_last"}' \
--output-image ./flux_nexfort_compile.png
```


## FLUX.1-dev Performance comparison
**Testing on NVIDIA H20-SXM4-80GB:**

Data update date: 2024-10-23

| Framework | Iteration Speed (it/s) | E2E Time (seconds) | Max Memory Used (GiB) | Warmup time (seconds) <sup>1</sup> | Warmup with Cache time (seconds) |
|--------------------|------------------------|--------------------|-----------------------|-------------|------------------------|
| PyTorch | 1.30 | 15.72 | 35.73 | 16.68 | - |
| OneDiff (NexFort) | 1.76 (+35.4%) | 11.57 (-26.4%) | 34.85 | 750.78 | 28.57 |

<sup>1</sup> OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Platinum 8468V.

**Testing on NVIDIA L20-SXM4-48GB:**

Data update date: 2024-10-28

| Framework | Iteration Speed (it/s) | E2E Time (seconds) | Max Memory Used (GiB) | Warmup time (seconds) <sup>2</sup> | Warmup with Cache time (seconds) |
|--------------------|------------------------|--------------------|-----------------------|-------------|------------------------|
| PyTorch | 1.10 | 18.45 | 35.71 | 18.695 | - |
| OneDiff (NexFort) | 1.41 (+28.2%) | 14.44 (-21.7%) | 34.83 | 546.52 | 25.32 |

<sup>2</sup> OneDiff Warmup with Compilation time is tested on AMD EPYC 9354 32-Core Processor.



## FLUX.1-schnell Performance comparison
**Testing on NVIDIA H20-SXM4-80GB:**

Data update date: 2024-10-23

| Framework | Iteration Speed (it/s) | E2E Time (seconds) | Max Memory Used (GiB) | Warmup time (seconds) <sup>1</sup> | Warmup with Cache time (seconds) |
|--------------------|------------------------|--------------------|-----------------------|-------------|------------------------|
| PyTorch | 1.30 | 3.38 | 35.71 | 4.35 | - |
| OneDiff (NexFort) | 1.75 (+34.6%) | 2.46 (-27.2%) | 34.83 | 201.41 | 19.57 |

<sup>1</sup> OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Platinum 8468V.

**Testing on NVIDIA L20-SXM4-48GB:**

Data update date: 2024-10-28

| Framework | Iteration Speed (it/s) | E2E Time (seconds) | Max Memory Used (GiB) | Warmup time (seconds) <sup>2</sup> | Warmup with Cache time (seconds) |
|--------------------|------------------------|--------------------|-----------------------|-------------|------------------------|
| PyTorch | 1.10 | 3.94 | 35.69 | 4.15 | - |
| OneDiff (NexFort) | 1.41 (+28.2%) | 3.03 (-23.1%) | 34.81 | 145.63 | 13.56 |

<sup>2</sup> OneDiff Warmup with Compilation time is tested on AMD EPYC 9354 32-Core Processor.
Loading
Loading