From 38e8b65afafe5aa36e1f258268efa9c7a4405bb5 Mon Sep 17 00:00:00 2001
From: Yi Zhang <zhanyi@microsoft.com>
Date: Mon, 29 Jan 2024 14:16:50 +0800
Subject: [PATCH] Cache Stable Diffusion and OpenCLIP models in the CI pipeline

---
 .../models/stable_diffusion/demo_utils.py     |  6 +++-
 .../models/stable_diffusion/engine_builder.py |  6 ++--
 .../stable_diffusion/test/check_image.py      | 15 ++++----
 .../azure-pipelines/bigmodels-ci-pipeline.yml | 35 ++++++++++++++++---
 4 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py
index 32c673416fce2..7bbaf195fc0fe 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py
@@ -242,6 +242,8 @@ def parse_arguments(is_xl: bool, parser):
     parser.add_argument("--deterministic", action="store_true", help="use deterministic algorithms.")
     parser.add_argument("-dc", "--disable-cuda-graph", action="store_true", help="Disable cuda graph.")
 
+    parser.add_argument("--framework-model-dir", default=None, help="framework model directory")
+
     group = parser.add_argument_group("Options for ORT_CUDA engine only")
     group.add_argument("--enable-vae-slicing", action="store_true", help="True will feed only one image to VAE once.")
 
@@ -406,6 +408,7 @@ def initialize_pipeline(
     lora_scale=1.0,
     use_fp16_vae=True,
     use_vae=True,
+    framework_model_dir=None,
 ):
     pipeline_info = PipelineInfo(
         version,
@@ -425,7 +428,7 @@ def initialize_pipeline(
     input_engine_dir = engine_dir
 
     onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
-        work_dir=work_dir, pipeline_info=pipeline_info, engine_type=engine_type
+        work_dir=work_dir, pipeline_info=pipeline_info, engine_type=engine_type, framework_model_dir=framework_model_dir
     )
 
     pipeline = StableDiffusionPipeline(
@@ -558,6 +561,7 @@ def load_pipelines(args, batch_size=None):
         "lora_scale": args.lora_scale,
         "use_fp16_vae": "xl" in args.version,
         "use_vae": True,
+        "framework_model_dir": args.framework_model_dir,
     }
 
     if "xl" in args.version:
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py
index 46a83f5dc228d..f9af78fc4d288 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py
@@ -5,6 +5,7 @@
 import hashlib
 import os
 from enum import Enum
+from typing import Optional
 
 import torch
 from diffusion_models import CLIP, VAE, CLIPWithProj, PipelineInfo, UNet, UNetXL
@@ -273,7 +274,7 @@ def vae_decode(self, latents):
         return self._vae_decode(latents)
 
 
-def get_engine_paths(work_dir: str, pipeline_info: PipelineInfo, engine_type: EngineType):
+def get_engine_paths(work_dir: str, pipeline_info: PipelineInfo, engine_type: EngineType, framework_model_dir: Optional[str] = None):
     root_dir = work_dir or "."
     short_name = pipeline_info.short_name()
 
@@ -287,6 +288,7 @@ def get_engine_paths(work_dir: str, pipeline_info: PipelineInfo, engine_type: En
 
     # Shared among ORT_CUDA, ORT_TRT and TRT engines, and need use load_model(..., always_download_fp16=True)
     # So that the shared model is always fp16.
-    framework_model_dir = os.path.join(root_dir, "torch_model")
+    if framework_model_dir is None:
+        framework_model_dir = os.path.join(root_dir, "torch_model")
 
     return onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py
index fcfe8b081fb0a..5c9e9edefbb7c 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py
@@ -1,5 +1,6 @@
 import argparse
 import os
+from typing import Optional
 
 import cv2
 import open_clip
@@ -12,13 +13,14 @@ def arg_parser():
     parser = argparse.ArgumentParser(description="Options for Compare 2 image")
     parser.add_argument("--image1", type=str, help="Path to image 1")
     parser.add_argument("--image2", type=str, help="Path to image 2")
+    parser.add_argument("--cache_dir", type=str, help="Path to model cache directory")
     args = parser.parse_args()
     return args
 
 
-def image_encoder(img: Image.Image):  # -> torch.Tensor:
+def image_encoder(img: Image.Image, cache_dir: Optional[str] = None):  # -> torch.Tensor:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-16-plus-240", pretrained="laion400m_e32")
+    model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-16-plus-240", pretrained="laion400m_e32", cache_dir=cache_dir)
     model.to(device)
 
     img1 = Image.fromarray(img).convert("RGB")
@@ -41,11 +43,11 @@ def load_image(image_path: str):  # -> Image.Image:
     return img
 
 
-def generate_score(image1: str, image2: str):  # -> float:
+def generate_score(image1: str, image2: str, cache_dir: Optional[str] = None):  # -> float:
     test_img = load_image(image1)
     data_img = load_image(image2)
-    img1 = image_encoder(test_img)
-    img2 = image_encoder(data_img)
+    img1 = image_encoder(test_img, cache_dir)
+    img2 = image_encoder(data_img, cache_dir)
     cos_scores = util.pytorch_cos_sim(img1, img2)
     score = round(float(cos_scores[0][0]) * 100, 2)
     return score
@@ -55,7 +57,8 @@ def main():
     args = arg_parser()
     image1 = args.image1
     image2 = args.image2
-    score = round(generate_score(image1, image2), 2)
+    cache_dir = args.cache_dir
+    score = round(generate_score(image1, image2, cache_dir), 2)
     print("similarity Score: ", {score})
     if score < 99:
         print(f"{image1} and {image2} are different")
diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
index dd88a4d6d5632..11b3d61e6fa4c 100644
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -172,6 +172,8 @@ stages:
   - job: Stable_Diffusion
     variables:
       skipComponentGovernanceDetection: true
+      CLIP_MODEL_CACHE: $(Agent.TempDirectory)/clip_cache
+      STABLE_DIFFUSION_MODEL_CACHE: $(Agent.TempDirectory)/stablediffusion_cache
     workspace:
       clean: all
     pool: onnxruntime-Linux-GPU-A10-12G
@@ -188,9 +190,21 @@ stages:
         SpecificArtifact: ${{ parameters.specificArtifact }}
         BuildId: ${{ parameters.BuildId }}
 
+    - task: Cache@2
+      inputs:
+        key: stable_diffusion | $(Build.SourcesDirectory)/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py
+        restoreKeys: |
+          stable_diffusion | $(Build.SourcesDirectory)/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py
+          stable_diffusion
+        path: $(STABLE_DIFFUSION_MODEL_CACHE)
+      displayName: Cache stable diffusion model
+
     - script: |
-        docker run --rm --gpus all -v $PWD:/workspace -v $(Build.BinariesDirectory)/Release:/Release nvcr.io/nvidia/pytorch:22.11-py3 \
-          bash -c '
+        docker run --rm --gpus all -v $PWD:/workspace \
+          -v $(Build.BinariesDirectory)/Release:/Release \
+          -v $(STABLE_DIFFUSION_MODEL_CACHE):/model_cache:rw \
+          nvcr.io/nvidia/pytorch:22.11-py3 \
+          bash -c ' \
             set -ex; \
             python3 --version; \
             python3 -m pip install --upgrade pip; \
@@ -199,15 +213,26 @@ stages:
             python3 -m pip install -r requirements-cuda11.txt; \
             python3 -m pip install --upgrade polygraphy onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com; \
             echo Generate an image guided by a text prompt; \
-            python3 demo_txt2img.py --seed 1 --deterministic "astronaut riding a horse on mars" ; \
+            python3 demo_txt2img.py --framework-model-dir /model_cache --seed 1 --deterministic "astronaut riding a horse on mars" ; \
             find $(pwd) -name "*.png" ; \
             popd ; \
             '
       displayName: 'Run stable diffusion demo'
       workingDirectory: $(Build.SourcesDirectory)
 
+    - task: Cache@2
+      inputs:
+        key: '"clip_model" | "2.24.0"'
+        restoreKeys: |
+          "clip_model" | "2.24.0"
+          "clip_model"
+        path: $(CLIP_MODEL_CACHE)
+      displayName: Cache clip model
+
     - script: |
-        docker run --rm --gpus all -v $PWD:/workspace nvcr.io/nvidia/pytorch:22.11-py3 \
+        docker run --rm --gpus all -v $PWD:/workspace \
+          -v $(CLIP_MODEL_CACHE):/model_cache:rw  \
+          nvcr.io/nvidia/pytorch:22.11-py3 \
           bash -c '
             set -ex; \
             python3 --version; \
@@ -217,7 +242,7 @@ stages:
             pushd test; \
             python3 -m pip install -r requirements.txt; \
             echo check demo_txt2image.py generate image; \
-            python3 -u check_image.py --image1 astronaut_riding_txt2image-DDIM-50.png --image2 $image2; \
+            python3 -u check_image.py --image1 astronaut_riding_txt2image-DDIM-50.png --image2 $image2 --cache_dir /model_cache ; \
             popd ; \
             popd ; \
             '