livepeer · stronk-dev · Apr 11, 2024 · Apr 11, 2024 · Apr 14, 2024 · Apr 14, 2024
@@ -3,6 +3,7 @@
 
 from diffusers import (
  AutoPipelineForImage2Image,
+ StableDiffusionInstructPix2PixPipeline,
  StableDiffusionXLPipeline,
  UNet2DConditionModel,
  EulerDiscreteScheduler,
@@ -11,6 +12,7 @@
 from huggingface_hub import file_download, hf_hub_download
 import torch
 import PIL
+import random
 from typing import List
 import logging
 import os
@@ -22,6 +24,7 @@
 logger = logging.getLogger(__name__)
 
 SDXL_LIGHTNING_MODEL_ID = "ByteDance/SDXL-Lightning"
+PIX2PIX_MODEL_ID = "timbrooks/instruct-pix2pix"
 
 
 class ImageToImagePipeline(Pipeline):
@@ -87,6 +90,12 @@ def __init__(self, model_id: str):
  self.ldm.scheduler = EulerDiscreteScheduler.from_config(
  self.ldm.scheduler.config, timestep_spacing="trailing"
  )
+ elif PIX2PIX_MODEL_ID in model_id:
+ kwargs["torch_dtype"] = torch.float16
+ kwargs["variant"] = "fp16"
+ self.ldm = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+ model_id, **kwargs
+ ).to(torch_device)
  else:
  self.ldm = AutoPipelineForImage2Image.from_pretrained(
  model_id, **kwargs
@@ -142,6 +151,11 @@ def __call__(self, prompt: str, image: PIL.Image, **kwargs) -> List[PIL.Image]:
  else:
  # Default to 2step
  kwargs["num_inference_steps"] = 2
+ elif PIX2PIX_MODEL_ID in self.model_id:
+ if "image_guidance_scale" not in kwargs:
+ kwargs["image_guidance_scale"] = round(random.uniform(1.2, 1.8), ndigits=2)
+ if "num_inference_steps" not in kwargs:
+ kwargs["num_inference_steps"] = 50
 
  return self.ldm(prompt, image=image, **kwargs).images
 

@@ -36,6 +36,7 @@ async def image_to_image(
  model_id: Annotated[str, Form()] = "",
  strength: Annotated[float, Form()] = 0.8,
  guidance_scale: Annotated[float, Form()] = 7.5,
+ image_guidance_scale: Annotated[float, Form()] = 0,
  negative_prompt: Annotated[str, Form()] = "",
  seed: Annotated[int, Form()] = None,
  num_images_per_prompt: Annotated[int, Form()] = 1,
@@ -80,6 +81,7 @@ async def image_to_image(
  image=image,
  strength=strength,
  guidance_scale=guidance_scale,
+ image_guidance_scale=image_guidance_scale,
  negative_prompt=negative_prompt,
  seed=seed,
  num_images_per_prompt=num_images_per_prompt,

@@ -60,6 +60,7 @@ if [ "$MODE" = "alpha" ]; then
 
  # Download text-to-image and image-to-image models.
  huggingface-cli download ByteDance/SDXL-Lightning --include "*unet.safetensors" --exclude "*lora.safetensors*" --cache-dir models
+ huggingface-cli download timbrooks/instruct-pix2pix --include "*fp16.safetensors" --exclude "*lora.safetensors*" --cache-dir models
 
  # Download image-to-video models (token-gated).
  printf "\nDownloading token-gated models...\n"

@@ -255,6 +255,11 @@
  "title": "Guidance Scale",
  "default": 7.5
  },
+ "image_guidance_scale": {
+ "type": "number",
+ "title": "Image Guidance Scale",
+ "default": 7.5
+ },
  "negative_prompt": {
  "type": "string",
  "title": "Negative Prompt",