From 02592a554366e7114a48e3dc7c35b7a4f523c168 Mon Sep 17 00:00:00 2001
From: Yondon Fu
Date: Tue, 6 Feb 2024 16:33:42 +0000
Subject: [PATCH] runner: Support stable-fast using env var

---
 runner/Dockerfile                      |  4 +++-
 runner/README.md                       |  6 +++++-
 runner/app/pipelines/image_to_image.py | 10 ++++++++++
 runner/app/pipelines/image_to_video.py | 10 ++++++++++
 runner/app/pipelines/sfast.py          | 29 +++++++++++++++++++++++++++++
 runner/app/pipelines/text_to_image.py  | 10 ++++++++++
 runner/requirements.txt                |  4 +++-
 7 files changed, 70 insertions(+), 3 deletions(-)
 create mode 100644 runner/app/pipelines/sfast.py

diff --git a/runner/Dockerfile b/runner/Dockerfile
index d28f1974..8d70bb76 100644
--- a/runner/Dockerfile
+++ b/runner/Dockerfile
@@ -30,12 +30,14 @@ RUN pyenv install $PYTHON_VERSION && \
 # Upgrade pip and install your desired packages
 ARG PIP_VERSION=23.3.2
 RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} setuptools wheel && \
-    pip install --no-cache-dir torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2
+    pip install --no-cache-dir torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1
 
 WORKDIR /app
 COPY ./requirements.txt /app
 RUN pip install --no-cache-dir -r requirements.txt
 
+RUN pip install https://github.com/chengzeyi/stable-fast/releases/download/v1.0.3/stable_fast-1.0.3+torch211cu121-cp311-cp311-manylinux2014_x86_64.whl
+
 # Most DL models are quite large in terms of memory, using workers is a HUGE
 # slowdown because of the fork and GIL with python.
 # Using multiple pods seems like a better default strategy.
diff --git a/runner/README.md b/runner/README.md
index a25da647..9196b2c8 100644
--- a/runner/README.md
+++ b/runner/README.md
@@ -3,7 +3,7 @@
 ## Build Docker image
 
 ```
-docker build -t runner .
+docker build -t livepeer/ai-runner:latest .
 ```
 
 ## Download models
@@ -19,6 +19,10 @@ pip install "huggingface_hub[cli]"
 ./dl-checkpoints.sh
 ```
 
+## Optimizations
+
+- Set the environment variable `SFAST=true` to enable dynamic compilation with [stable-fast](https://github.com/chengzeyi/stable-fast), which speeds up inference for diffusion pipelines (the initial requests will be slower while the model is being compiled).
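+
+  For example (a sketch; `<other flags>` stands in for the pipeline-specific
+  flags shown in the run instructions below):
+
+  ```
+  docker run -e SFAST=true <other flags> livepeer/ai-runner:latest
+  ```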
+
 ## Run text-to-image container
 
 Run container:
diff --git a/runner/app/pipelines/image_to_image.py b/runner/app/pipelines/image_to_image.py
index 2cd50720..2e009a83 100644
--- a/runner/app/pipelines/image_to_image.py
+++ b/runner/app/pipelines/image_to_image.py
@@ -7,6 +7,7 @@
 import PIL
 from typing import List
 import logging
+import os
 
 from PIL import ImageFile
 
@@ -34,6 +35,15 @@ def __init__(self, model_id: str):
         self.ldm = AutoPipelineForImage2Image.from_pretrained(model_id, **kwargs)
         self.ldm.to(get_torch_device())
 
+        if os.environ.get("SFAST"):
+            logger.info(
+                "ImageToImagePipeline will be dynamically compiled with stable-fast for %s",
+                model_id,
+            )
+            from app.pipelines.sfast import compile_model
+
+            self.ldm = compile_model(self.ldm)
+
     def __call__(self, prompt: str, image: PIL.Image, **kwargs) -> List[PIL.Image]:
         seed = kwargs.pop("seed", None)
         if seed is not None:
diff --git a/runner/app/pipelines/image_to_video.py b/runner/app/pipelines/image_to_video.py
index d73cba33..b6308823 100644
--- a/runner/app/pipelines/image_to_video.py
+++ b/runner/app/pipelines/image_to_video.py
@@ -7,6 +7,7 @@
 import PIL
 from typing import List
 import logging
+import os
 
 from PIL import ImageFile
 
@@ -34,6 +35,15 @@ def __init__(self, model_id: str):
         self.ldm = StableVideoDiffusionPipeline.from_pretrained(model_id, **kwargs)
         self.ldm.to(get_torch_device())
 
+        if os.environ.get("SFAST"):
+            logger.info(
+                "ImageToVideoPipeline will be dynamically compiled with stable-fast for %s",
+                model_id,
+            )
+            from app.pipelines.sfast import compile_model
+
+            self.ldm = compile_model(self.ldm)
+
     def __call__(self, image: PIL.Image, **kwargs) -> List[List[PIL.Image]]:
         if "decode_chunk_size" not in kwargs:
             kwargs["decode_chunk_size"] = 8
diff --git a/runner/app/pipelines/sfast.py b/runner/app/pipelines/sfast.py
new file mode 100644
index 00000000..c9b450f2
--- /dev/null
+++ b/runner/app/pipelines/sfast.py
@@ -0,0 +1,29 @@
+from sfast.compilers.diffusion_pipeline_compiler import compile, CompilationConfig
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def compile_model(model):
+    config = CompilationConfig.Default()
+
+    # xformers and Triton are recommended for best performance.
+    # Triton might be slow to generate, compile, and fine-tune kernels.
+    try:
+        import xformers
+
+        config.enable_xformers = True
+    except ImportError:
+        logger.info("xformers not installed, skipping")
+    # NOTE:
+    # When GPU VRAM is insufficient or the architecture is too old, Triton might be slow.
+    # Disable Triton if you encounter this problem.
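+    # (To keep Triton disabled even when it is installed, set
+    # config.enable_triton = False instead of running the import probe below.)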
+    try:
+        import triton
+
+        config.enable_triton = True
+    except ImportError:
+        logger.info("Triton not installed, skipping")
+
+    model = compile(model, config)
+    return model
diff --git a/runner/app/pipelines/text_to_image.py b/runner/app/pipelines/text_to_image.py
index 25480ff2..eb6b5292 100644
--- a/runner/app/pipelines/text_to_image.py
+++ b/runner/app/pipelines/text_to_image.py
@@ -7,6 +7,7 @@
 import PIL
 from typing import List
 import logging
+import os
 
 logger = logging.getLogger(__name__)
 
@@ -30,6 +31,15 @@ def __init__(self, model_id: str):
         self.ldm = AutoPipelineForText2Image.from_pretrained(model_id, **kwargs)
         self.ldm.to(get_torch_device())
 
+        if os.environ.get("SFAST"):
+            logger.info(
+                "TextToImagePipeline will be dynamically compiled with stable-fast for %s",
+                model_id,
+            )
+            from app.pipelines.sfast import compile_model
+
+            self.ldm = compile_model(self.ldm)
+
     def __call__(self, prompt: str, **kwargs) -> List[PIL.Image]:
         seed = kwargs.pop("seed", None)
         if seed is not None:
diff --git a/runner/requirements.txt b/runner/requirements.txt
index 0f39df9f..cc953d5f 100644
--- a/runner/requirements.txt
+++ b/runner/requirements.txt
@@ -6,4 +6,6 @@ pydantic
 Pillow
 python-multipart
 uvicorn
-huggingface_hub
\ No newline at end of file
+huggingface_hub
+xformers==0.0.23
+triton>=2.1.0
\ No newline at end of file
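For reference, a minimal sketch of how the new `compile_model` helper is exercised outside the FastAPI app, mirroring what each pipeline's `__init__` does when `SFAST` is set. It assumes a CUDA GPU plus the torch/diffusers/stable-fast versions installed by the Dockerfile above; the model id and prompt are illustrative:

```
import torch
from diffusers import AutoPipelineForText2Image

from app.pipelines.sfast import compile_model

# Load any diffusers pipeline supported by stable-fast (illustrative model id).
pipe = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Wrap the pipeline; compilation itself happens dynamically on first use.
pipe = compile_model(pipe)

# The first call is slow while kernels are generated and compiled;
# later calls with the same shapes reuse the compiled graph.
image = pipe(prompt="a sunset over the mountains").images[0]
image.save("out.png")
```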