From 77c1c145aef430eb46f0aa1b36213358e751b1eb Mon Sep 17 00:00:00 2001
From: Marco van Dijk
Date: Thu, 11 Apr 2024 21:52:06 +0200
Subject: [PATCH 1/5] Add support for the `timbrooks/instruct-pix2pix` model

---
 runner/app/pipelines/image_to_image.py | 6 ++++++
 runner/dl_checkpoints.sh               | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/runner/app/pipelines/image_to_image.py b/runner/app/pipelines/image_to_image.py
index efa4a958..69fd0456 100644
--- a/runner/app/pipelines/image_to_image.py
+++ b/runner/app/pipelines/image_to_image.py
@@ -3,6 +3,7 @@
 from diffusers import (
     AutoPipelineForImage2Image,
+    StableDiffusionInstructPix2PixPipeline,
     StableDiffusionXLPipeline,
     UNet2DConditionModel,
     EulerDiscreteScheduler,
@@ -22,6 +23,7 @@
 logger = logging.getLogger(__name__)
 
 SDXL_LIGHTNING_MODEL_ID = "ByteDance/SDXL-Lightning"
+PIX2PIX_MODEL_ID = "timbrooks/instruct-pix2pix"
 
 
 class ImageToImagePipeline(Pipeline):
@@ -87,6 +89,10 @@ def __init__(self, model_id: str):
             self.ldm.scheduler = EulerDiscreteScheduler.from_config(
                 self.ldm.scheduler.config, timestep_spacing="trailing"
             )
+        elif PIX2PIX_MODEL_ID in model_id:
+            self.ldm = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+                model_id, **kwargs
+            ).to(torch_device)
         else:
             self.ldm = AutoPipelineForImage2Image.from_pretrained(
                 model_id, **kwargs
diff --git a/runner/dl_checkpoints.sh b/runner/dl_checkpoints.sh
index 33751bad..c922522c 100755
--- a/runner/dl_checkpoints.sh
+++ b/runner/dl_checkpoints.sh
@@ -60,6 +60,7 @@ if [ "$MODE" = "alpha" ]; then
     # Download text-to-image and image-to-image models.
     huggingface-cli download ByteDance/SDXL-Lightning --include "*unet.safetensors" --exclude "*lora.safetensors*" --cache-dir models
+    huggingface-cli download timbrooks/instruct-pix2pix --include "*fp16.safetensors" --exclude "*lora.safetensors*" --cache-dir models
 
     # Download image-to-video models (token-gated).
     printf "\nDownloading token-gated models...\n"
@@ -80,6 +81,9 @@ else
     # Download image-to-video models.
     huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --include "*.fp16.safetensors" "*.json" --cache-dir models
 
+    # Download text-to-video models.
+    huggingface-cli download ali-vilab/text-to-video-ms-1.7b --include "*.fp16.safetensors" "*.json" --cache-dir models
+
     # Download image-to-video models (token-gated).
     printf "\nDownloading token-gated models...\n"
     check_hf_auth

From c648ea2e3aa08d901a7b30c178004f75aaff5457 Mon Sep 17 00:00:00 2001
From: Marco van Dijk
Date: Thu, 11 Apr 2024 22:33:55 +0200
Subject: [PATCH 2/5] Remove text-to-vid model

---
 runner/dl_checkpoints.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/runner/dl_checkpoints.sh b/runner/dl_checkpoints.sh
index c922522c..2852d1b8 100755
--- a/runner/dl_checkpoints.sh
+++ b/runner/dl_checkpoints.sh
@@ -81,9 +81,6 @@ else
     # Download image-to-video models.
     huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --include "*.fp16.safetensors" "*.json" --cache-dir models
 
-    # Download text-to-video models.
-    huggingface-cli download ali-vilab/text-to-video-ms-1.7b --include "*.fp16.safetensors" "*.json" --cache-dir models
-
     # Download image-to-video models (token-gated).
printf "\nDownloading token-gated models...\n" check_hf_auth From 8fa4d91ca332b5b5fcc8176ebd074ee243b31910 Mon Sep 17 00:00:00 2001 From: Marco van Dijk Date: Sun, 14 Apr 2024 11:48:56 +0200 Subject: [PATCH 3/5] Tweak pix2pix --- runner/app/pipelines/image_to_image.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/runner/app/pipelines/image_to_image.py b/runner/app/pipelines/image_to_image.py index 69fd0456..1a279ceb 100644 --- a/runner/app/pipelines/image_to_image.py +++ b/runner/app/pipelines/image_to_image.py @@ -12,6 +12,7 @@ from huggingface_hub import file_download, hf_hub_download import torch import PIL +import random from typing import List import logging import os @@ -90,6 +91,8 @@ def __init__(self, model_id: str): self.ldm.scheduler.config, timestep_spacing="trailing" ) elif PIX2PIX_MODEL_ID in model_id: + kwargs["torch_dtype"] = torch.float16 + kwargs["variant"] = "fp16" self.ldm = StableDiffusionInstructPix2PixPipeline.from_pretrained( model_id, **kwargs ).to(torch_device) @@ -148,6 +151,10 @@ def __call__(self, prompt: str, image: PIL.Image, **kwargs) -> List[PIL.Image]: else: # Default to 2step kwargs["num_inference_steps"] = 2 + elif PIX2PIX_MODEL_ID in self.model_id: + kwargs["guidance_scale"] = round(random.uniform(6.0, 9.0), ndigits=2) + kwargs["image_guidance_scale"] = round(random.uniform(1.2, 1.8), ndigits=2) + kwargs["num_inference_steps"] = 50 return self.ldm(prompt, image=image, **kwargs).images From 7931c0300f630c34a5bf1e72b439255eb814cf8f Mon Sep 17 00:00:00 2001 From: Marco van Dijk Date: Sun, 14 Apr 2024 11:51:31 +0200 Subject: [PATCH 4/5] Do not randomize guidance scale --- runner/app/pipelines/image_to_image.py | 1 - 1 file changed, 1 deletion(-) diff --git a/runner/app/pipelines/image_to_image.py b/runner/app/pipelines/image_to_image.py index 1a279ceb..5f6e8b6f 100644 --- a/runner/app/pipelines/image_to_image.py +++ b/runner/app/pipelines/image_to_image.py @@ -152,7 +152,6 @@ def __call__(self, prompt: str, image: PIL.Image, **kwargs) -> List[PIL.Image]: # Default to 2step kwargs["num_inference_steps"] = 2 elif PIX2PIX_MODEL_ID in self.model_id: - kwargs["guidance_scale"] = round(random.uniform(6.0, 9.0), ndigits=2) kwargs["image_guidance_scale"] = round(random.uniform(1.2, 1.8), ndigits=2) kwargs["num_inference_steps"] = 50 From 76d59bb9fb26ac9517764bba866becf8f226c74c Mon Sep 17 00:00:00 2001 From: Marco van Dijk Date: Mon, 15 Apr 2024 20:11:35 +0200 Subject: [PATCH 5/5] Make image guidance scale a param --- runner/app/pipelines/image_to_image.py | 6 ++-- runner/app/routes/image_to_image.py | 2 ++ runner/openapi.json | 5 ++++ worker/runner.gen.go | 39 +++++++++++++------------- 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/runner/app/pipelines/image_to_image.py b/runner/app/pipelines/image_to_image.py index 5f6e8b6f..4f9d348e 100644 --- a/runner/app/pipelines/image_to_image.py +++ b/runner/app/pipelines/image_to_image.py @@ -152,8 +152,10 @@ def __call__(self, prompt: str, image: PIL.Image, **kwargs) -> List[PIL.Image]: # Default to 2step kwargs["num_inference_steps"] = 2 elif PIX2PIX_MODEL_ID in self.model_id: - kwargs["image_guidance_scale"] = round(random.uniform(1.2, 1.8), ndigits=2) - kwargs["num_inference_steps"] = 50 + if "image_guidance_scale" not in kwargs: + kwargs["image_guidance_scale"] = round(random.uniform(1.2, 1.8), ndigits=2) + if "num_inference_steps" not in kwargs: + kwargs["num_inference_steps"] = 50 return self.ldm(prompt, image=image, **kwargs).images diff --git a/runner/app/routes/image_to_image.py 
b/runner/app/routes/image_to_image.py index 6af61bca..2a4f60af 100644 --- a/runner/app/routes/image_to_image.py +++ b/runner/app/routes/image_to_image.py @@ -36,6 +36,7 @@ async def image_to_image( model_id: Annotated[str, Form()] = "", strength: Annotated[float, Form()] = 0.8, guidance_scale: Annotated[float, Form()] = 7.5, + image_guidance_scale: Annotated[float, Form()] = 0, negative_prompt: Annotated[str, Form()] = "", seed: Annotated[int, Form()] = None, num_images_per_prompt: Annotated[int, Form()] = 1, @@ -80,6 +81,7 @@ async def image_to_image( image=image, strength=strength, guidance_scale=guidance_scale, + image_guidance_scale=image_guidance_scale, negative_prompt=negative_prompt, seed=seed, num_images_per_prompt=num_images_per_prompt, diff --git a/runner/openapi.json b/runner/openapi.json index 57dcc4a0..62270e7a 100644 --- a/runner/openapi.json +++ b/runner/openapi.json @@ -255,6 +255,11 @@ "title": "Guidance Scale", "default": 7.5 }, + "image_guidance_scale": { + "type": "number", + "title": "Guidance Scale", + "default": 7.5 + }, "negative_prompt": { "type": "string", "title": "Negative Prompt", diff --git a/worker/runner.gen.go b/worker/runner.gen.go index 9e2e74b3..2fb431a0 100644 --- a/worker/runner.gen.go +++ b/worker/runner.gen.go @@ -35,6 +35,7 @@ type APIError struct { type BodyImageToImageImageToImagePost struct { GuidanceScale *float32 `json:"guidance_scale,omitempty"` Image openapi_types.File `json:"image"` + ImageGuidanceScale *float32 `json:"image_guidance_scale,omitempty"` ModelId *string `json:"model_id,omitempty"` NegativePrompt *string `json:"negative_prompt,omitempty"` NumImagesPerPrompt *int `json:"num_images_per_prompt,omitempty"` @@ -1077,25 +1078,25 @@ func HandlerWithOptions(si ServerInterface, options ChiServerOptions) http.Handl // Base64 encoded, gzipped, json marshaled Swagger object var swaggerSpec = []string{ - "H4sIAAAAAAAC/+xXS2/jNhD+KwTbozd23E1T+Jb0tUabbhC720MQGIw0lrkrkSw5TNcI/N8LkrZEvSqn", - "yKZAkZNew5lvZr556JEmslBSgEBDZ4/UJBsomL+9uJ7/qLXU7l5pqUAjB/+lMJm7IMcc6IxemYyOKG6V", - "ezCoucjobjeiGv60XENKZ7f+yN2oPFLqLs/J+4+QIN2N6KVMtytesAxWKPc3jUclDbZhZZanTCSwMglz", - "Vh5pCmtmc6Sz85OzyvjPezmy8HIlBGGLe9AOgrfiFKylLhjSGb3nguktrZTMvUjL7REtZAr5iqc1+zQ6", - "eeUEyDztOiwgY8gfYKW0LBT26vhtL0eug1yXKluEaJmVAt2l8DTSZwviPTLkGnRLKxcIWQhNpedwth+C", - "AUhjyYV77lJqUIPIcFODNzn5rgK4OEi0stUgmjqgCTmMOHcsrwYp+cBTkM3HbkqulanzsILzkzKdsdgA", - "zzb1RJ2df1udexe+dx39z2hbSORSrO5t8gmwqeR0eh5rcZLk0kvWtEV+CMkNrJjNVj3EmEwj6jphcmEz", - "0s+RJ1DxL542zJ1Opm8rc3/47+2TDRoOsK+fQh3se7dcXvd04hSQ8dzdfa1hTWf0q3HVz8f7Zj4uu20T", - "5f54BLOy1QPkA8t5ylwSByFxhMIMYWvq21VYfgiaSiBMa7b1PsRomwq6cAPLcfP9BpJPbbwGGdp6ldL3", - "v9C49XiBrglX1WRloMO+L7obMEoKA20EoUsfHbErSDmL4xQad1ecWow0ca7rsDpwB0vtiB1bS1bnsdzv", - "Oh/cE6yX8RYipAFIB8IlfMal9I5cM81C8L7UVlB15iN68esa8PQ1oOy9T2y2ezARYdq86CDPYCvLZVKr", - "Sia279d0dvvY8vGxBfEuKtBfZeLNtEp01FqlwZieAR1eVKIeM1m6t0NF5fwIpvaSUaSOaJ8f3HTqb19r", - "zYpG+3piH2vEpNyQguKBvrY3H7tUw9tyyDMysZrjduGgBOxulFwC06DL3yB36D68KpVsEBXdOR1crGWo", - "I5Nornx+Z/RCEKZUzkPCCUqirSAXc6K4gpyL4M+BF/wBFIB232+sEN7QA2gTdE1OTk8mLiBSgWCK0xn9", - "xr8aUcVw42GPN370+EYHvh5darzxeVpOJupCFuLhT00nE3dJpEAQ/lQEevzROPOHf8GhNMazzwemHpCF", - "TRIwZm1zUqbEp8AWhVtNS4ju5dh3qjco35Sr7GGvrrvlK3tf4DTwAQy6HavhV2Fz5IppHLud+E3KkB3v", - "2rF/DLs6J1Fb2H3BiNfn9rExH9G3z5n1ck/ssH/JUnITUuLtTqfPare1MrYRVCKkXCvPXsr9uUDQguVk", - "AfoBNKl270Pf8TMk7ji3d7u7uCZ8islShmncqA3/tzBYG74LvlRt9P/PvHBt1Hv/a238n2sjMNzXBsJn", - "PGJsRGvhP1bGv3e+vXi+DofXAnjeAnAci2fDbvd3AAAA//92bsyPxhcAAA==", + 
"H4sIAAAAAAAC/+xXS2/jNhD+KwTboxM77qYpfEv6WqNNN4jd7SEIBEYay9yVSJaPdA1D/70gKUvUq3K6", + "2RwWOek1nPlm5puH9jjmueAMmFZ4sccq3kJO3O3lzfJnKbm090JyAVJTcF9yldqLpjoDvMDXKsUTrHfC", + "PigtKUtxUUywhL8NlZDgxZ07cj+pjlS6q3P84QPEGhcTfMWTXURzkkKkeXnTehRc6S6s1NCEsBgiFRNr", + "ZY8T2BCTaby4OD2vjf9ayqGVk6sgMJM/gLQQnBWrYMNlTjRe4AfKiNzhWsnSiXTcLs9Gz4cl5wlkEU0a", + "OnCA5NoKoGXSB4ZBSjR9hEhIngs9qOOPUg7deLk+VSb30VeRANmn8CzQZ3LkIqTQDciOVso0pN69Ws/h", + "7DAEBZCEkiv73KdUaQks1dsGvNnpDzXA1UGiE/EWccUBjedEwOFjeTpK8UeaAG8/9lN8I1STSzWcX4Tq", + "jcUWaLptJur84vv63Fv/ve/o55TBZ9E255pyFj2Y+CPotpKz+UWoxUqiKyfZ0Bb4wThVEBGTRgPEmM0D", + "6lphdGlSNMyRJ1DxH5q0zJ3N5m9qc3+5792TLRqOsG+YQj3se7te3wx09gQ0oZm9+1bCBi/wN9N6PkzL", + "4TCtuncbZXk8gFnbGgDynmQ0ITaJo5CohlyNYWvrK2osP3lNFRAiJdk5H0K0bQV9uIFkevvjFuKPXbxK", + "E22aVYrf/YbD1uME+iZmXZO1gR77ruhuQQnOFHQR+C59dMSuIaEkjJNv3H1x6jBShbluwurB7S11I3Zs", + "LRmZhXJ/ymx07zBOxlkIkHogPQjX8EmvuXPkhkjig/eltoy6Mx/Ri1/XgKevAVXvfWKzLcEEhOnyooc8", + "o60s43GjKgnbvdvgxd2+4+O+A/E+KNDfeezMdEp00lnNQamBAe1f1KIOM1rbt2NFZf3wpkrJIFJHtM/3", + "djoNt6+NJHmrfT2xj7ViUm1IXvFIXyvNhy418HYccoyMjaR6t7JQPHY7Sq6ASJDVb5U99OBfVUq2Wgtc", + "WB2UbbivIxVLKlx+F/iSISJERn3CkeZIGoYul0hQARll3p8DL+gjCABpv98axpyhR5DK65qdnp3ObEC4", + "AEYExQv8nXs1wYLorYM93brR4xoduHq0qXHGl0k1mbANmY+HOzWfzewl5kwDc6cC0NMPypo//FuOpTGc", + "fS4wzYCsTByDUhuToSolLgUmz+1qWkG0L6euU51oflKtsoe9uumWq+yywLHnAyhtd6yWX7nJNBVE6qnd", + "iU8Sosnxrh37x1A0OamlgeILRrw5t4+N+QS/ec6sV3tij/0rkqBbnxJndz5/VrudlbGLoBZB1Vp5/lLu", + "L5kGyUiGViAfQaJ69z70HTdDwo5zd1/chzXhUozW3E/jVm24v4XR2nBd8KVqY/h/5oVro9n7X2vja64N", + "z3BXGxo+6SPGRrAW/mdl/H/nu4vn63B4LYDnLQDLsXA2FMW/AQAA//+fOxznFhgAAA==", } // GetSwagger returns the content of the embedded swagger specification file