From 02592a554366e7114a48e3dc7c35b7a4f523c168 Mon Sep 17 00:00:00 2001
From: Yondon Fu
Date: Tue, 6 Feb 2024 16:33:42 +0000
Subject: [PATCH] runner: Support stable-fast using env var

---
 runner/Dockerfile                      |  4 +++-
 runner/README.md                       |  6 +++++-
 runner/app/pipelines/image_to_image.py | 10 ++++++++++
 runner/app/pipelines/image_to_video.py | 10 ++++++++++
 runner/app/pipelines/sfast.py          | 29 +++++++++++++++++++++++++++++
 runner/app/pipelines/text_to_image.py  | 10 ++++++++++
 runner/requirements.txt                |  4 +++-
 7 files changed, 70 insertions(+), 3 deletions(-)
 create mode 100644 runner/app/pipelines/sfast.py

diff --git a/runner/Dockerfile b/runner/Dockerfile
index d28f1974..8d70bb76 100644
--- a/runner/Dockerfile
+++ b/runner/Dockerfile
@@ -30,12 +30,14 @@ RUN pyenv install $PYTHON_VERSION && \
 # Upgrade pip and install your desired packages
 ARG PIP_VERSION=23.3.2
 RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} setuptools wheel && \
-    pip install --no-cache-dir torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2
+    pip install --no-cache-dir torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1
 
 WORKDIR /app
 COPY ./requirements.txt /app
 RUN pip install --no-cache-dir -r requirements.txt
 
+RUN pip install https://github.com/chengzeyi/stable-fast/releases/download/v1.0.3/stable_fast-1.0.3+torch211cu121-cp311-cp311-manylinux2014_x86_64.whl
+
 # Most DL models are quite large in terms of memory, using workers is a HUGE
 # slowdown because of the fork and GIL with python.
 # Using multiple pods seems like a better default strategy.
diff --git a/runner/README.md b/runner/README.md
index a25da647..9196b2c8 100644
--- a/runner/README.md
+++ b/runner/README.md
@@ -3,7 +3,7 @@
 ## Build Docker image
 
 ```
-docker build -t runner .
+docker build -t livepeer/ai-runner:latest .
 ```
 
 ## Download models
@@ -19,6 +19,10 @@ pip install "huggingface_hub[cli]"
 ./dl-checkpoints.sh
 ```
 
+## Optimizations
+
+- Set the environment variable `SFAST=true` to enable dynamic compilation with [stable-fast](https://github.com/chengzeyi/stable-fast), which speeds up inference for diffusion pipelines (the initial requests will be slower while the model is being compiled).
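+
+  For example (a sketch; `<other flags>` stands in for the pipeline-specific
+  flags shown in the run instructions below):
+
+  ```
+  docker run -e SFAST=true <other flags> livepeer/ai-runner:latest
+  ```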
+
 ## Run text-to-image container
 
 Run container:
diff --git a/runner/app/pipelines/image_to_image.py b/runner/app/pipelines/image_to_image.py
index 2cd50720..2e009a83 100644
--- a/runner/app/pipelines/image_to_image.py
+++ b/runner/app/pipelines/image_to_image.py
@@ -7,6 +7,7 @@
 import PIL
 from typing import List
 import logging
+import os
 
 from PIL import ImageFile
 
@@ -34,6 +35,15 @@ def __init__(self, model_id: str):
         self.ldm = AutoPipelineForImage2Image.from_pretrained(model_id, **kwargs)
         self.ldm.to(get_torch_device())
 
+        if os.environ.get("SFAST"):
+            logger.info(
+                "ImageToImagePipeline will be dynamically compiled with stable-fast for %s",
+                model_id,
+            )
+            from app.pipelines.sfast import compile_model
+
+            self.ldm = compile_model(self.ldm)
+
     def __call__(self, prompt: str, image: PIL.Image, **kwargs) -> List[PIL.Image]:
         seed = kwargs.pop("seed", None)
         if seed is not None:
diff --git a/runner/app/pipelines/image_to_video.py b/runner/app/pipelines/image_to_video.py
index d73cba33..b6308823 100644
--- a/runner/app/pipelines/image_to_video.py
+++ b/runner/app/pipelines/image_to_video.py
@@ -7,6 +7,7 @@
 import PIL
 from typing import List
 import logging
+import os
 
 from PIL import ImageFile
 
@@ -34,6 +35,15 @@ def __init__(self, model_id: str):
         self.ldm = StableVideoDiffusionPipeline.from_pretrained(model_id, **kwargs)
         self.ldm.to(get_torch_device())
 
+        if os.environ.get("SFAST"):
+            logger.info(
+                "ImageToVideoPipeline will be dynamically compiled with stable-fast for %s",
+                model_id,
+            )
+            from app.pipelines.sfast import compile_model
+
+            self.ldm = compile_model(self.ldm)
+
     def __call__(self, image: PIL.Image, **kwargs) -> List[List[PIL.Image]]:
         if "decode_chunk_size" not in kwargs:
             kwargs["decode_chunk_size"] = 8
diff --git a/runner/app/pipelines/sfast.py b/runner/app/pipelines/sfast.py
new file mode 100644
index 00000000..c9b450f2
--- /dev/null
+++ b/runner/app/pipelines/sfast.py
@@ -0,0 +1,29 @@
+from sfast.compilers.diffusion_pipeline_compiler import compile, CompilationConfig
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def compile_model(model):
+    config = CompilationConfig.Default()
+
+    # xformers and Triton are recommended for best performance.
+    # Triton might be slow to generate, compile, and fine-tune kernels.
+    try:
+        import xformers
+
+        config.enable_xformers = True
+    except ImportError:
+        logger.info("xformers not installed, skipping")
+    # NOTE:
+    # When GPU VRAM is insufficient or the architecture is too old, Triton might be slow.
+    # Disable Triton if you encounter this problem.
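+    # (To keep Triton disabled even when it is installed, set
+    # config.enable_triton = False instead of running the import probe below.)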
+    try:
+        import triton
+
+        config.enable_triton = True
+    except ImportError:
+        logger.info("Triton not installed, skipping")
+
+    model = compile(model, config)
+    return model
diff --git a/runner/app/pipelines/text_to_image.py b/runner/app/pipelines/text_to_image.py
index 25480ff2..eb6b5292 100644
--- a/runner/app/pipelines/text_to_image.py
+++ b/runner/app/pipelines/text_to_image.py
@@ -7,6 +7,7 @@
 import PIL
 from typing import List
 import logging
+import os
 
 logger = logging.getLogger(__name__)
 
@@ -30,6 +31,15 @@ def __init__(self, model_id: str):
         self.ldm = AutoPipelineForText2Image.from_pretrained(model_id, **kwargs)
         self.ldm.to(get_torch_device())
 
+        if os.environ.get("SFAST"):
+            logger.info(
+                "TextToImagePipeline will be dynamically compiled with stable-fast for %s",
+                model_id,
+            )
+            from app.pipelines.sfast import compile_model
+
+            self.ldm = compile_model(self.ldm)
+
     def __call__(self, prompt: str, **kwargs) -> List[PIL.Image]:
         seed = kwargs.pop("seed", None)
         if seed is not None:
diff --git a/runner/requirements.txt b/runner/requirements.txt
index 0f39df9f..cc953d5f 100644
--- a/runner/requirements.txt
+++ b/runner/requirements.txt
@@ -6,4 +6,6 @@ pydantic
 Pillow
 python-multipart
 uvicorn
-huggingface_hub
\ No newline at end of file
+huggingface_hub
+xformers==0.0.23
+triton>=2.1.0
\ No newline at end of file
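For reference, a minimal sketch of how the new `compile_model` helper is exercised outside the FastAPI app, mirroring what each pipeline's `__init__` does when `SFAST` is set. It assumes a CUDA GPU plus the torch/diffusers/stable-fast versions installed by the Dockerfile above; the model id and prompt are illustrative:

```
import torch
from diffusers import AutoPipelineForText2Image

from app.pipelines.sfast import compile_model

# Load any diffusers pipeline supported by stable-fast (illustrative model id).
pipe = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Wrap the pipeline; compilation itself happens dynamically on first use.
pipe = compile_model(pipe)

# The first call is slow while kernels are generated and compiled;
# later calls with the same shapes reuse the compiled graph.
image = pipe(prompt="a sunset over the mountains").images[0]
image.save("out.png")
```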