From 8789d0b6c7eed14e1f751022e74bdb0670f0e1b4 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 13 Nov 2023 15:35:15 +0100 Subject: [PATCH 01/13] fix styling issues on main (#5754) fix styling issues --- examples/text_to_image/train_text_to_image_flax.py | 5 +---- examples/text_to_image/train_text_to_image_lora_sdxl.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index 9ebe34555310..e62d03c730b1 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -272,10 +272,7 @@ def main(): if args.dataset_name is not None: # Downloading and loading a dataset from the hub. dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, - data_dir=args.train_data_dir + args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir ) else: data_files = {} diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py index 1a6ef0c856db..b69940603128 100644 --- a/examples/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -765,10 +765,7 @@ def load_model_hook(models, input_dir): if args.dataset_name is not None: # Downloading and loading a dataset from the hub. dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, - data_dir=args.train_data_dir + args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir ) else: data_files = {} From c9f847a70f7d44f6a856fd61d4fa03dbbab72fdc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 13 Nov 2023 18:37:55 +0100 Subject: [PATCH 02/13] [Wuerstchen] fix for when USE_PEFT_BACKEND is True (#5704) * fix for when USE_PEFT_BACKEND is True * Update modeling_wuerstchen_prior.py * revert change * add lora tests --- .../wuerstchen/modeling_wuerstchen_common.py | 18 ++++- .../wuerstchen/modeling_wuerstchen_prior.py | 14 ++-- .../wuerstchen/test_wuerstchen_prior.py | 77 ++++++++++++++++++- 3 files changed, 99 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py b/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py index b3aac39386bc..00d6f01beced 100644 --- a/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +++ b/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py @@ -17,6 +17,8 @@ import torch.nn as nn from ...models.attention_processor import Attention +from ...models.lora import LoRACompatibleConv, LoRACompatibleLinear +from ...utils import USE_PEFT_BACKEND class WuerstchenLayerNorm(nn.LayerNorm): @@ -32,7 +34,8 @@ def forward(self, x): class TimestepBlock(nn.Module): def __init__(self, c, c_timestep): super().__init__() - self.mapper = nn.Linear(c_timestep, c * 2) + linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear + self.mapper = linear_cls(c_timestep, c * 2) def forward(self, x, t): a, b = self.mapper(t)[:, :, None, None].chunk(2, dim=1) @@ -42,10 +45,14 @@ def forward(self, x, t): class ResBlock(nn.Module): def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0): super().__init__() - self.depthwise = nn.Conv2d(c + c_skip, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c) + + conv_cls = nn.Conv2d if 
USE_PEFT_BACKEND else LoRACompatibleConv + linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear + + self.depthwise = conv_cls(c + c_skip, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c) self.norm = WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6) self.channelwise = nn.Sequential( - nn.Linear(c, c * 4), nn.GELU(), GlobalResponseNorm(c * 4), nn.Dropout(dropout), nn.Linear(c * 4, c) + linear_cls(c, c * 4), nn.GELU(), GlobalResponseNorm(c * 4), nn.Dropout(dropout), linear_cls(c * 4, c) ) def forward(self, x, x_skip=None): @@ -73,10 +80,13 @@ def forward(self, x): class AttnBlock(nn.Module): def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0): super().__init__() + + linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear + self.self_attn = self_attn self.norm = WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6) self.attention = Attention(query_dim=c, heads=nhead, dim_head=c // nhead, dropout=dropout, bias=True) - self.kv_mapper = nn.Sequential(nn.SiLU(), nn.Linear(c_cond, c)) + self.kv_mapper = nn.Sequential(nn.SiLU(), linear_cls(c_cond, c)) def forward(self, x, kv): kv = self.kv_mapper(kv) diff --git a/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py index ca72ce581fcc..a7d9e32fb6c9 100644 --- a/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py @@ -28,8 +28,9 @@ AttnAddedKVProcessor, AttnProcessor, ) +from ...models.lora import LoRACompatibleConv, LoRACompatibleLinear from ...models.modeling_utils import ModelMixin -from ...utils import is_torch_version +from ...utils import USE_PEFT_BACKEND, is_torch_version from .modeling_wuerstchen_common import AttnBlock, ResBlock, TimestepBlock, WuerstchenLayerNorm @@ -40,12 +41,15 @@ class WuerstchenPrior(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): @register_to_config def __init__(self, c_in=16, c=1280, c_cond=1024, c_r=64, depth=16, nhead=16, dropout=0.1): super().__init__() + conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv + linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear + self.c_r = c_r - self.projection = nn.Conv2d(c_in, c, kernel_size=1) + self.projection = conv_cls(c_in, c, kernel_size=1) self.cond_mapper = nn.Sequential( - nn.Linear(c_cond, c), + linear_cls(c_cond, c), nn.LeakyReLU(0.2), - nn.Linear(c, c), + linear_cls(c, c), ) self.blocks = nn.ModuleList() @@ -55,7 +59,7 @@ def __init__(self, c_in=16, c=1280, c_cond=1024, c_r=64, depth=16, nhead=16, dro self.blocks.append(AttnBlock(c, c, nhead, self_attn=True, dropout=dropout)) self.out = nn.Sequential( WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6), - nn.Conv2d(c, c_in * 2, kernel_size=1), + conv_cls(c, c_in * 2, kernel_size=1), ) self.gradient_checkpointing = False diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_prior.py b/tests/pipelines/wuerstchen/test_wuerstchen_prior.py index 59dbc90b98ab..5e1b89c0d2e0 100644 --- a/tests/pipelines/wuerstchen/test_wuerstchen_prior.py +++ b/tests/pipelines/wuerstchen/test_wuerstchen_prior.py @@ -17,11 +17,24 @@ import numpy as np import torch +import torch.nn as nn +import torch.nn.functional as F from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import DDPMWuerstchenScheduler, WuerstchenPriorPipeline +from diffusers.loaders import AttnProcsLayers +from diffusers.models.attention_processor import ( + LoRAAttnProcessor, 
+ LoRAAttnProcessor2_0, +) from diffusers.pipelines.wuerstchen import WuerstchenPrior -from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device +from diffusers.utils.import_utils import is_peft_available +from diffusers.utils.testing_utils import enable_full_determinism, require_peft_backend, skip_mps, torch_device + + +if is_peft_available(): + from peft import LoraConfig + from peft.tuners.tuners_utils import BaseTunerLayer from ..test_pipelines_common import PipelineTesterMixin @@ -29,6 +42,19 @@ enable_full_determinism() +def create_prior_lora_layers(unet: nn.Module): + lora_attn_procs = {} + for name in unet.attn_processors.keys(): + lora_attn_processor_class = ( + LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor + ) + lora_attn_procs[name] = lora_attn_processor_class( + hidden_size=unet.config.c, + ) + unet_lora_layers = AttnProcsLayers(lora_attn_procs) + return lora_attn_procs, unet_lora_layers + + class WuerstchenPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = WuerstchenPriorPipeline params = ["prompt"] @@ -219,3 +245,52 @@ def callback_inputs_test(pipe, i, t, callback_kwargs): output = pipe(**inputs)[0] assert output.abs().sum() == 0 + + def check_if_lora_correctly_set(self, model) -> bool: + """ + Checks if the LoRA layers are correctly set with peft + """ + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + return True + return False + + def get_lora_components(self): + prior = self.dummy_prior + + prior_lora_config = LoraConfig( + r=4, lora_alpha=4, target_modules=["to_q", "to_k", "to_v", "to_out.0"], init_lora_weights=False + ) + + prior_lora_attn_procs, prior_lora_layers = create_prior_lora_layers(prior) + + lora_components = { + "prior_lora_layers": prior_lora_layers, + "prior_lora_attn_procs": prior_lora_attn_procs, + } + + return prior, prior_lora_config, lora_components + + @require_peft_backend + def test_inference_with_prior_lora(self): + _, prior_lora_config, _ = self.get_lora_components() + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output_no_lora = pipe(**self.get_dummy_inputs(device)) + image_embed = output_no_lora.image_embeddings + self.assertTrue(image_embed.shape == (1, 2, 24, 24)) + + pipe.prior.add_adapter(prior_lora_config) + self.assertTrue(self.check_if_lora_correctly_set(pipe.prior), "Lora not correctly set in prior") + + output_lora = pipe(**self.get_dummy_inputs(device)) + lora_image_embed = output_lora.image_embeddings + + self.assertTrue(image_embed.shape == lora_image_embed.shape) From 1ce4b5f3e3235d27babe9b0023624b99930c7ee4 Mon Sep 17 00:00:00 2001 From: Jianqi Pan Date: Tue, 14 Nov 2023 02:54:21 +0900 Subject: [PATCH 03/13] fix: fix forward function signature of controlnet reference_only pipeline example (#5717) fix: ignore other args Co-authored-by: Patrick von Platen --- examples/community/stable_diffusion_controlnet_reference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py index d786036bd58a..b18f27664037 100644 --- a/examples/community/stable_diffusion_controlnet_reference.py +++ b/examples/community/stable_diffusion_controlnet_reference.py @@ -546,7 +546,7 @@ def hack_CrossAttnDownBlock2D_forward( return hidden_states, output_states - 
def hacked_DownBlock2D_forward(self, hidden_states, temb=None): + def hacked_DownBlock2D_forward(self, hidden_states, temb=None, *args, **kwargs): eps = 1e-6 output_states = () @@ -642,7 +642,7 @@ def hacked_CrossAttnUpBlock2D_forward( return hidden_states - def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): + def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, *args, **kwargs): eps = 1e-6 for i, resnet in enumerate(self.resnets): # pop res hidden states From ef7787ea597425889e09c00c15093d21c7435b0f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 13 Nov 2023 18:54:53 +0100 Subject: [PATCH 04/13] make style --- examples/community/stable_diffusion_controlnet_reference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py index b18f27664037..358fc1c6dc67 100644 --- a/examples/community/stable_diffusion_controlnet_reference.py +++ b/examples/community/stable_diffusion_controlnet_reference.py @@ -642,7 +642,9 @@ def hacked_CrossAttnUpBlock2D_forward( return hidden_states - def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, *args, **kwargs): + def hacked_UpBlock2D_forward( + self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, *args, **kwargs + ): eps = 1e-6 for i, resnet in enumerate(self.resnets): # pop res hidden states From 0488810f613932b84146ae7462f995e698eeef62 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 13 Nov 2023 17:55:17 +0000 Subject: [PATCH 05/13] Fix realfill example compatibility with latest torchvision version (#5736) --- examples/research_projects/realfill/requirements.txt | 2 +- examples/research_projects/realfill/train_realfill.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/research_projects/realfill/requirements.txt b/examples/research_projects/realfill/requirements.txt index bf14291f53a9..3827f0852a20 100644 --- a/examples/research_projects/realfill/requirements.txt +++ b/examples/research_projects/realfill/requirements.txt @@ -3,7 +3,7 @@ accelerate==0.23.0 transformers==4.34.0 peft==0.5.0 torch==2.0.1 -torchvision==0.15.2 +torchvision>=0.16 ftfy==6.1.1 tensorboard==2.14.0 Jinja2==3.1.2 diff --git a/examples/research_projects/realfill/train_realfill.py b/examples/research_projects/realfill/train_realfill.py index 9d00a21b1a74..1549d813058d 100644 --- a/examples/research_projects/realfill/train_realfill.py +++ b/examples/research_projects/realfill/train_realfill.py @@ -450,10 +450,10 @@ def __init__( self.transform = transforms_v2.Compose( [ + transforms_v2.ToImage(), transforms_v2.RandomResize(size, int(1.125 * size)), transforms_v2.RandomCrop(size), - transforms_v2.ToImageTensor(), - transforms_v2.ConvertImageDtype(), + transforms_v2.ToDtype(torch.float32, scale=True), transforms_v2.Normalize([0.5], [0.5]), ] ) From 8fcd52febbc63eb6b1966b20a1b0268523dd68f7 Mon Sep 17 00:00:00 2001 From: "Thuan H. 
Nguyen" <32274287+thuanz123@users.noreply.github.com> Date: Tue, 14 Nov 2023 01:01:15 +0700 Subject: [PATCH 06/13] Correct code for distributed training of RealFill (#5740) Correct code for distributed training --- examples/research_projects/realfill/train_realfill.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/research_projects/realfill/train_realfill.py b/examples/research_projects/realfill/train_realfill.py index 1549d813058d..e251d8d1769c 100644 --- a/examples/research_projects/realfill/train_realfill.py +++ b/examples/research_projects/realfill/train_realfill.py @@ -639,7 +639,7 @@ def save_model_hook(models, weights, output_dir): for model in models: sub_dir = ( "unet" - if isinstance(model.base_model.model, type(accelerator.unwrap_model(unet.base_model.model))) + if isinstance(model.base_model.model, type(accelerator.unwrap_model(unet).base_model.model)) else "text_encoder" ) model.save_pretrained(os.path.join(output_dir, sub_dir)) @@ -654,12 +654,12 @@ def load_model_hook(models, input_dir): sub_dir = ( "unet" - if isinstance(model.base_model.model, type(accelerator.unwrap_model(unet.base_model.model))) + if isinstance(model.base_model.model, type(accelerator.unwrap_model(unet).base_model.model)) else "text_encoder" ) model_cls = ( UNet2DConditionModel - if isinstance(model.base_model.model, type(accelerator.unwrap_model(unet.base_model.model))) + if isinstance(model.base_model.model, type(accelerator.unwrap_model(unet).base_model.model)) else CLIPTextModel ) @@ -937,8 +937,8 @@ def load_model_hook(models, input_dir): if accelerator.is_main_process: pipeline = StableDiffusionInpaintPipeline.from_pretrained( args.pretrained_model_name_or_path, - unet=accelerator.unwrap_model(unet.merge_and_unload(), keep_fp32_wrapper=True), - text_encoder=accelerator.unwrap_model(text_encoder.merge_and_unload(), keep_fp32_wrapper=True), + unet=accelerator.unwrap_model(unet, keep_fp32_wrapper=True).merge_and_unload(), + text_encoder=accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True).merge_and_unload(), revision=args.revision, ) From 4d3b4e00edb76abf928bb651e31f29f34d4287b7 Mon Sep 17 00:00:00 2001 From: JacobYuan7 <30721381+JacobYuan7@users.noreply.github.com> Date: Tue, 14 Nov 2023 02:06:18 +0800 Subject: [PATCH 07/13] Update the reference for text_to_video.md (#5706) * Update the reference for text_to_video.md The original reference (VideoFusion) might be misleading. VideoFusion is not open-sourced. I am the co-first author of ModelScopeT2V. I change the referred paper to the right one. * Update docs/source/en/api/pipelines/text_to_video.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: YiYi Xu Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/api/pipelines/text_to_video.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/api/pipelines/text_to_video.md b/docs/source/en/api/pipelines/text_to_video.md index 6d28fb0e29d0..e6e081cfa645 100644 --- a/docs/source/en/api/pipelines/text_to_video.md +++ b/docs/source/en/api/pipelines/text_to_video.md @@ -18,11 +18,11 @@ specific language governing permissions and limitations under the License. # Text-to-video -[VideoFusion: Decomposed Diffusion Models for High-Quality Video Generation](https://huggingface.co/papers/2303.08320) is by Zhengxiong Luo, Dayou Chen, Yingya Zhang, Yan Huang, Liang Wang, Yujun Shen, Deli Zhao, Jingren Zhou, Tieniu Tan. 
+[ModelScope Text-to-Video Technical Report](https://arxiv.org/abs/2308.06571) is by Jiuniu Wang, Hangjie Yuan, Dayou Chen, Yingya Zhang, Xiang Wang, Shiwei Zhang. The abstract from the paper is: -*A diffusion probabilistic model (DPM), which constructs a forward diffusion process by gradually adding noise to data points and learns the reverse denoising process to generate new samples, has been shown to handle complex data distribution. Despite its recent success in image synthesis, applying DPMs to video generation is still challenging due to high-dimensional data spaces. Previous methods usually adopt a standard diffusion process, where frames in the same video clip are destroyed with independent noises, ignoring the content redundancy and temporal correlation. This work presents a decomposed diffusion process via resolving the per-frame noise into a base noise that is shared among all frames and a residual noise that varies along the time axis. The denoising pipeline employs two jointly-learned networks to match the noise decomposition accordingly. Experiments on various datasets confirm that our approach, termed as VideoFusion, surpasses both GAN-based and diffusion-based alternatives in high-quality video generation. We further show that our decomposed formulation can benefit from pre-trained image diffusion models and well-support text-conditioned video creation.* +*This paper introduces ModelScopeT2V, a text-to-video synthesis model that evolves from a text-to-image synthesis model (i.e., Stable Diffusion). ModelScopeT2V incorporates spatio-temporal blocks to ensure consistent frame generation and smooth movement transitions. The model could adapt to varying frame numbers during training and inference, rendering it suitable for both image-text and video-text datasets. ModelScopeT2V brings together three components (i.e., VQGAN, a text encoder, and a denoising UNet), totally comprising 1.7 billion parameters, in which 0.5 billion parameters are dedicated to temporal capabilities. The model demonstrates superior performance over state-of-the-art methods across three evaluation metrics. The code and an online demo are available at https://modelscope.cn/models/damo/text-to-video-synthesis/summary.* You can find additional information about Text-to-Video on the [project page](https://modelscope.cn/models/damo/text-to-video-synthesis/summary), [original codebase](https://github.com/modelscope/modelscope/), and try it out in a [demo](https://huggingface.co/spaces/damo-vilab/modelscope-text-to-video-synthesis). Official checkpoints can be found at [damo-vilab](https://huggingface.co/damo-vilab) and [cerspense](https://huggingface.co/cerspense). 
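As a quick orientation, the text-to-video pipeline documented in this file can be exercised with a short snippet along the following lines. This is a minimal sketch rather than part of the patch: the `damo-vilab/text-to-video-ms-1.7b` checkpoint, the fp16 variant, and the step count are illustrative assumptions, not values taken from this diff.

```python
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

# Load a ModelScopeT2V checkpoint in half precision to reduce memory use (assumed checkpoint name).
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"
)
pipe = pipe.to("cuda")

# Generate a short clip and write the frames out as a video file.
prompt = "Spiderman is surfing"
video_frames = pipe(prompt, num_inference_steps=25).frames
video_path = export_to_video(video_frames)
```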
@@ -177,4 +177,4 @@ Here are some sample outputs: - __call__ ## TextToVideoSDPipelineOutput -[[autodoc]] pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput \ No newline at end of file +[[autodoc]] pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput From 80e78d2cac7d65c0f6171688d5bfde3f834ba99f Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:01:52 -0800 Subject: [PATCH 08/13] [docs] Custom community components (#5732) * fixes * feedback --- .../custom_pipeline_overview.md | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/docs/source/en/using-diffusers/custom_pipeline_overview.md b/docs/source/en/using-diffusers/custom_pipeline_overview.md index f602e73eb2c6..10627d3163d8 100644 --- a/docs/source/en/using-diffusers/custom_pipeline_overview.md +++ b/docs/source/en/using-diffusers/custom_pipeline_overview.md @@ -60,15 +60,13 @@ For more information about community pipelines, take a look at the [Community pi ## Community components -If your pipeline has custom components that Diffusers doesn't support already, you need to accompany the Python modules that implement them. These customized components could be VAE, UNet, scheduler, etc. For the text encoder, we rely on `transformers` anyway. So, that should be handled separately (more info here). The pipeline code itself can be customized as well. +Community components allow users to build pipelines that may have customized components that are not a part of Diffusers. If your pipeline has custom components that Diffusers doesn't already support, you need to provide their implementations as Python modules. These customized components could be a VAE, UNet, and scheduler. In most cases, the text encoder is imported from the Transformers library. The pipeline code itself can also be customized. -Community components allow users to build pipelines that may have customized components that are not part of Diffusers. This section shows how users should use community components to build a community pipeline. +This section shows how users should use community components to build a community pipeline. -You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example here. Here, you have a custom UNet and a customized pipeline (`TextToVideoIFPipeline`). For convenience, let's call the UNet `ShowOneUNet3DConditionModel`. +You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example. So, let's start loading the components: -"showlab/show-1-base" already provides the checkpoints in the Diffusers format, which is a great starting point. So, let's start loading up the components which are already well-supported: - -1. **Text encoder** +1. Import and load the text encoder from Transformers: ```python from transformers import T5Tokenizer, T5EncoderModel @@ -78,7 +76,7 @@ tokenizer = T5Tokenizer.from_pretrained(pipe_id, subfolder="tokenizer") text_encoder = T5EncoderModel.from_pretrained(pipe_id, subfolder="text_encoder") ``` -2. **Scheduler** +2. Load a scheduler: ```python from diffusers import DPMSolverMultistepScheduler @@ -86,7 +84,7 @@ from diffusers import DPMSolverMultistepScheduler scheduler = DPMSolverMultistepScheduler.from_pretrained(pipe_id, subfolder="scheduler") ``` -3. **Image processor** +3. 
Load an image processor: ```python from transformers import CLIPFeatureExtractor @@ -94,9 +92,15 @@ from transformers import CLIPFeatureExtractor feature_extractor = CLIPFeatureExtractor.from_pretrained(pipe_id, subfolder="feature_extractor") ``` -Now, you need to implement the custom UNet. The implementation is available [here](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py). So, let's create a Python script called `showone_unet_3d_condition.py` and copy over the implementation, changing the `UNet3DConditionModel` classname to `ShowOneUNet3DConditionModel` to avoid any conflicts with Diffusers. This is because Diffusers already has one `UNet3DConditionModel`. We put all the components needed to implement the class in `showone_unet_3d_condition.py` only. You can find the entire file [here](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py). + + +In steps 4 and 5, the custom [UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py) and [pipeline](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) implementation must match the format shown in their files for this example to work. + + + +4. Now you'll load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py), which in this example, has already been implemented in the `showone_unet_3d_condition.py` [script](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) for your convenience. You'll notice the `UNet3DConditionModel` class name is changed to `ShowOneUNet3DConditionModel` because [`UNet3DConditionModel`] already exists in Diffusers. Any components needed for the `ShowOneUNet3DConditionModel` class should be placed in the `showone_unet_3d_condition.py` script. -Once this is done, we can initialize the UNet: +Once this is done, you can initialize the UNet: ```python from showone_unet_3d_condition import ShowOneUNet3DConditionModel @@ -104,9 +108,9 @@ from showone_unet_3d_condition import ShowOneUNet3DConditionModel unet = ShowOneUNet3DConditionModel.from_pretrained(pipe_id, subfolder="unet") ``` -Then implement the custom `TextToVideoIFPipeline` in another Python script: `pipeline_t2v_base_pixel.py`. This is already available [here](https://github.com/showlab/Show-1/blob/main/showone/pipelines/pipeline_t2v_base_pixel.py). +5. Finally, you'll load the custom pipeline code. For this example, it has already been created for you in the `pipeline_t2v_base_pixel.py` [script](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Just like the custom UNet, any code needed for the custom pipeline to work should go in the `pipeline_t2v_base_pixel.py` script. -Now that you have all the components, initialize the `TextToVideoIFPipeline`: +Once everything is in place, you can initialize the `TextToVideoIFPipeline` with the `ShowOneUNet3DConditionModel`: ```python from pipeline_t2v_base_pixel import TextToVideoIFPipeline @@ -123,7 +127,7 @@ pipeline = pipeline.to(device="cuda") pipeline.torch_dtype = torch.float16 ``` -Push to the pipeline to the Hub to share with the community: +Push the pipeline to the Hub to share with the community! 
```python pipeline.push_to_hub("custom-t2v-pipeline") @@ -131,11 +135,11 @@ pipeline.push_to_hub("custom-t2v-pipeline") After the pipeline is successfully pushed, you need a couple of changes: -1. In `model_index.json` file, change the `_class_name` attribute. It should be like [so](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2). -2. Upload `showone_unet_3d_condition.py` to the `unet` directory ([example](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py)). -3. Upload `pipeline_t2v_base_pixel.py` to the pipeline base directory ([example](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py)). +1. Change the `_class_name` attribute in [`model_index.json`](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2) to `"pipeline_t2v_base_pixel"` and `"TextToVideoIFPipeline"`. +2. Upload `showone_unet_3d_condition.py` to the `unet` [directory](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py). +3. Upload `pipeline_t2v_base_pixel.py` to the pipeline base [directory](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py). -To run inference, just do: +To run inference, simply add the `trust_remote_code` argument while initializing the pipeline to handle all the "magic" behind the scenes. ```python from diffusers import DiffusionPipeline @@ -161,6 +165,4 @@ video_frames = pipeline( guidance_scale=9.0, output_type="pt" ).frames -``` - -Here, notice the use of the `trust_remote_code` argument while initializing the pipeline. It is responsible for handling all the "magic" behind the scenes. +``` \ No newline at end of file From f782ca112a30b5e022a74ae029d47f4c62f7fca4 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Mon, 13 Nov 2023 12:11:07 -0800 Subject: [PATCH 09/13] [docs] Callbacks (#5735) * updates * feedback --- docs/source/en/_toctree.yml | 4 +-- docs/source/en/using-diffusers/callback.md | 29 +++++++++++-------- .../alt_diffusion/pipeline_alt_diffusion.py | 2 +- .../pipeline_alt_diffusion_img2img.py | 2 +- .../kandinsky2_2/pipeline_kandinsky2_2.py | 2 +- .../pipeline_kandinsky2_2_combined.py | 6 ++-- .../pipeline_kandinsky2_2_img2img.py | 2 +- .../pipeline_kandinsky2_2_inpainting.py | 2 +- .../pipeline_kandinsky2_2_prior.py | 2 +- .../pipeline_latent_consistency_img2img.py | 2 +- .../pipeline_latent_consistency_text2img.py | 2 +- .../pipeline_stable_diffusion.py | 2 +- .../pipeline_stable_diffusion_depth2img.py | 2 +- .../pipeline_stable_diffusion_img2img.py | 2 +- .../pipeline_stable_diffusion_inpaint.py | 2 +- ...eline_stable_diffusion_instruct_pix2pix.py | 2 +- .../pipeline_stable_diffusion_xl.py | 2 +- .../pipeline_stable_diffusion_xl_img2img.py | 2 +- .../pipeline_stable_diffusion_xl_inpaint.py | 2 +- .../wuerstchen/pipeline_wuerstchen.py | 2 +- .../pipeline_wuerstchen_combined.py | 4 +-- .../wuerstchen/pipeline_wuerstchen_prior.py | 2 +- 22 files changed, 42 insertions(+), 37 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a0c6159991b5..c7c330f000d0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -78,14 +78,14 @@ title: Kandinsky - local: using-diffusers/controlnet title: ControlNet - - local: using-diffusers/callback - title: Callback - local: using-diffusers/shap-e title: Shap-E - local: using-diffusers/diffedit title: 
DiffEdit - local: using-diffusers/distilled_sd title: Distilled Stable Diffusion inference + - local: using-diffusers/callback + title: Pipeline callbacks - local: using-diffusers/reproducibility title: Create reproducible pipelines - local: using-diffusers/custom_pipeline_examples diff --git a/docs/source/en/using-diffusers/callback.md b/docs/source/en/using-diffusers/callback.md index b4f16bda55eb..690d86c17a54 100644 --- a/docs/source/en/using-diffusers/callback.md +++ b/docs/source/en/using-diffusers/callback.md @@ -10,11 +10,19 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Using callback +# Pipeline callbacks -[[open-in-colab]] +The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. This can be really useful for *dynamically* adjusting certain pipeline attributes, or modifying tensor variables. The flexibility of callbacks opens up some interesting use-cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale. -Most 🤗 Diffusers pipelines now accept a `callback_on_step_end` argument that allows you to change the default behavior of denoising loop with custom defined functions. Here is an example of a callback function we can write to disable classifier-free guidance after 40% of inference steps to save compute with a minimum tradeoff in performance. +This guide will show you how to use the `callback_on_step_end` parameter to disable classifier-free guidance (CFG) after 40% of the inference steps to save compute with minimal cost to performance. + +The callback function should have the following arguments: + +* `pipe` (or the pipeline instance) provides access to useful properties such as `num_timestep` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipe._guidance_scale=0.0`. +* `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timestep`. +* `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly. + +Your callback function should look something like this: ```python def callback_dynamic_cfg(pipe, step_index, timestep, callback_kwargs): @@ -29,14 +37,9 @@ def callback_dynamic_cfg(pipe, step_index, timestep, callback_kwargs): return callback_kwargs ``` -Your callback function has below arguments: -* `pipe` is the pipeline instance, which provides access to useful properties such as `num_timestep` and `guidance_scale`. You can modify these properties by updating the underlying attributes. In this example, we disable CFG by setting `pipe._guidance_scale` to be `0`. -* `step_index` and `timestep` tell you where you are in the denoising loop. In our example, we use `step_index` to decide when to turn off CFG. 
-* `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables so please check the pipeline class's `_callback_tensor_inputs` attribute for the list of variables that you can modify. Common variables include `latents` and `prompt_embeds`. In our example, we need to adjust the batch size of `prompt_embeds` after setting `guidance_scale` to be `0` in order for it to work properly. +Now, you can pass the callback function to the `callback_on_step_end` parameter and the `prompt_embeds` to `callback_on_step_end_tensor_inputs`. -You can pass the callback function as `callback_on_step_end` argument to the pipeline along with `callback_on_step_end_tensor_inputs`. - -```python +```py import torch from diffusers import StableDiffusionPipeline @@ -51,10 +54,12 @@ out = pipe(prompt, generator=generator, callback_on_step_end=callback_custom_cfg out.images[0].save("out_custom_cfg.png") ``` -Your callback function will be executed at the end of each denoising step and modify pipeline attributes and tensor variables for the next denoising step. We successfully added the "dynamic CFG" feature to the stable diffusion pipeline without having to modify the code at all. +The callback function is executed at the end of each denoising step, and modifies the pipeline attributes and tensor variables for the next denoising step. + +With callbacks, you can implement features such as dynamic CFG without having to modify the underlying code at all! -Currently we only support `callback_on_step_end`. If you have a solid use case and require a callback function with a different execution point, please open a [Feature Request](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&projects=&template=feature_request.md&title=) so we can add it! +🤗 Diffusers currently only supports `callback_on_step_end`, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require a callback function with a different execution point! diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 2dbc2604ffce..335df9e6f461 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -728,7 +728,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index baaadefaad3e..7f24bad90f8d 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -780,7 +780,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. 
The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: Returns: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py index f077b5fffc62..d87aa9ff2d19 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py @@ -181,7 +181,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py index 2c7caa6214e5..2b8a49976fc9 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py @@ -283,7 +283,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: @@ -759,7 +759,7 @@ def __call__( prior_callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in - the `._callback_tensor_inputs` attribute of your pipeine class. + the `._callback_tensor_inputs` attribute of your pipeline class. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, @@ -768,7 +768,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index 7b5b677be79e..92343e2667e6 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -255,7 +255,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 168209dbf460..66e62303f3f6 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -362,7 +362,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 8d0e788b9dd9..83427c68f208 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -423,7 +423,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index 679415db7f3a..46993fa9a065 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -659,7 +659,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index ff5eea2d5584..884110d16225 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -597,7 +597,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. 
Examples: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 9911cbe7536d..486841d9db99 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -717,7 +717,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 36efb01f23ef..f4debbf422c1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -674,7 +674,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: ```py diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 40daecfa913f..d53ff8d3dce5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -775,7 +775,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: Returns: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index f9171d86c2d4..f37dc49c2ae3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -920,7 +920,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. 
Examples: ```py diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index a98f32920e34..daf5478cd249 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -211,7 +211,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 151cbed4e08f..c9e68976ccc9 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -853,7 +853,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index f444eddec0ab..40eabca82b14 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -1005,7 +1005,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 667e7aec00ed..97cd16b5fabb 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -1240,7 +1240,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. 
Examples: diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py index 66730d79da40..ed9ce91cb292 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py @@ -269,7 +269,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py index a21c33b43f92..d4de47ba0c9e 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py @@ -234,7 +234,7 @@ def __call__( prior_callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in - the `._callback_tensor_inputs` attribute of your pipeine class. + the `._callback_tensor_inputs` attribute of your pipeline class. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, @@ -243,7 +243,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py index 9b251cc77d35..8047f159677a 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -349,7 +349,7 @@ def __call__( callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. + `._callback_tensor_inputs` attribute of your pipeline class. 
Examples: From 4b45a1e147e4f8fab31b888f50db74b30a1aa105 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Mon, 13 Nov 2023 12:39:30 -0800 Subject: [PATCH 10/13] [docs] Use other checkpoints with inpaint (#5590) * tip about inpaint checkpoints * expand section * feedback --- docs/source/en/using-diffusers/inpaint.md | 222 +++++++++++++++++----- 1 file changed, 177 insertions(+), 45 deletions(-) diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md index 3d03d4e0e4d0..abdfbffb908b 100644 --- a/docs/source/en/using-diffusers/inpaint.md +++ b/docs/source/en/using-diffusers/inpaint.md @@ -184,6 +184,183 @@ make_image_grid([init_image, mask_image, image], rows=1, cols=3) +## Non-inpaint specific checkpoints + +So far, this guide has used inpaint specific checkpoints such as [runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting). But you can also use regular checkpoints like [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5). Let's compare the results of the two checkpoints. + +The image on the left is generated from a regular checkpoint, and the image on the right is from an inpaint checkpoint. You'll immediately notice the image on the left is not as clean, and you can still see the outline of the area the model is supposed to inpaint. The image on the right is much cleaner and the inpainted area appears more natural. + + + + +```py +import torch +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image, make_image_grid + +pipeline = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipeline.enable_model_cpu_offload() +# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed +pipeline.enable_xformers_memory_efficient_attention() + +# load base and mask image +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png") + +generator = torch.Generator("cuda").manual_seed(92) +prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" +image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0] +make_image_grid([init_image, image], rows=1, cols=2) +``` + + + + +```py +import torch +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image, make_image_grid + +pipeline = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipeline.enable_model_cpu_offload() +# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed +pipeline.enable_xformers_memory_efficient_attention() + +# load base and mask image +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png") + +generator = torch.Generator("cuda").manual_seed(92) +prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" +image = 
pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0] +make_image_grid([init_image, image], rows=1, cols=2) +``` + + + + +
+<!-- side-by-side figures: "runwayml/stable-diffusion-v1-5" vs. "runwayml/stable-diffusion-inpainting" -->
+ +However, for more basic tasks like erasing an object from an image (like the rocks in the road for example), a regular checkpoint yields pretty good results. There isn't as noticeable of difference between the regular and inpaint checkpoint. + + + + +```py +import torch +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image, make_image_grid + +pipeline = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipeline.enable_model_cpu_offload() +# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed +pipeline.enable_xformers_memory_efficient_attention() + +# load base and mask image +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/road-mask.png") + +image = pipeline(prompt="road", image=init_image, mask_image=mask_image).images[0] +make_image_grid([init_image, image], rows=1, cols=2) +``` + + + + +```py +import torch +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image, make_image_grid + +pipeline = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipeline.enable_model_cpu_offload() +# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed +pipeline.enable_xformers_memory_efficient_attention() + +# load base and mask image +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/road-mask.png") + +image = pipeline(prompt="road", image=init_image, mask_image=mask_image).images[0] +make_image_grid([init_image, image], rows=1, cols=2) +``` + + + + +
+<!-- side-by-side figures: "runwayml/stable-diffusion-v1-5" vs. "runwayml/stable-diffusion-inpainting" -->
+ +The trade-off of using a non-inpaint specific checkpoint is the overall image quality may be lower, but it generally tends to preserve the mask area (that is why you can see the mask outline). The inpaint specific checkpoints are intentionally trained to generate higher quality inpainted images, and that includes creating a more natural transition between the masked and unmasked areas. As a result, these checkpoints are more likely to change your unmasked area. + +If preserving the unmasked area is important for your task, you can use the code below to force the unmasked area of an image to remain the same at the expense of some more unnatural transitions between the masked and unmasked areas. + +```py +import PIL +import numpy as np +import torch + +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image, make_image_grid + +device = "cuda" +pipeline = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-inpainting", + torch_dtype=torch.float16, +) +pipeline = pipeline.to(device) + +img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" +mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + +init_image = load_image(img_url).resize((512, 512)) +mask_image = load_image(mask_url).resize((512, 512)) + +prompt = "Face of a yellow cat, high resolution, sitting on a park bench" +repainted_image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] +repainted_image.save("repainted_image.png") + +# Convert mask to grayscale NumPy array +mask_image_arr = np.array(mask_image.convert("L")) +# Add a channel dimension to the end of the grayscale mask +mask_image_arr = mask_image_arr[:, :, None] +# Binarize the mask: 1s correspond to the pixels which are repainted +mask_image_arr = mask_image_arr.astype(np.float32) / 255.0 +mask_image_arr[mask_image_arr < 0.5] = 0 +mask_image_arr[mask_image_arr >= 0.5] = 1 + +# Take the masked pixels from the repainted image and the unmasked pixels from the initial image +unmasked_unchanged_image_arr = (1 - mask_image_arr) * init_image + mask_image_arr * repainted_image +unmasked_unchanged_image = PIL.Image.fromarray(unmasked_unchanged_image_arr.round().astype("uint8")) +unmasked_unchanged_image.save("force_unmasked_unchanged.png") +make_image_grid([init_image, mask_image, repainted_image, unmasked_unchanged_image], rows=2, cols=2) +``` + ## Configure pipeline parameters Image features - like quality and "creativity" - are dependent on pipeline parameters. Knowing what these parameters do is important for getting the results you want. Let's take a look at the most important parameters and see how changing them affects the output. @@ -309,51 +486,6 @@ make_image_grid([init_image, mask_image, image], rows=1, cols=3) -## Preserve unmasked areas - -The [`AutoPipelineForInpainting`] (and other inpainting pipelines) generally changes the unmasked parts of an image to create a more natural transition between the masked and unmasked region. If this behavior is undesirable, you can force the unmasked area to remain the same. However, forcing the unmasked portion of the image to remain the same may result in some unusual transitions between the unmasked and masked areas. 
- -```py -import PIL -import numpy as np -import torch - -from diffusers import AutoPipelineForInpainting -from diffusers.utils import load_image, make_image_grid - -device = "cuda" -pipeline = AutoPipelineForInpainting.from_pretrained( - "runwayml/stable-diffusion-inpainting", - torch_dtype=torch.float16, -) -pipeline = pipeline.to(device) - -img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" -mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" - -init_image = load_image(img_url).resize((512, 512)) -mask_image = load_image(mask_url).resize((512, 512)) - -prompt = "Face of a yellow cat, high resolution, sitting on a park bench" -repainted_image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] -repainted_image.save("repainted_image.png") - -# Convert mask to grayscale NumPy array -mask_image_arr = np.array(mask_image.convert("L")) -# Add a channel dimension to the end of the grayscale mask -mask_image_arr = mask_image_arr[:, :, None] -# Binarize the mask: 1s correspond to the pixels which are repainted -mask_image_arr = mask_image_arr.astype(np.float32) / 255.0 -mask_image_arr[mask_image_arr < 0.5] = 0 -mask_image_arr[mask_image_arr >= 0.5] = 1 - -# Take the masked pixels from the repainted image and the unmasked pixels from the initial image -unmasked_unchanged_image_arr = (1 - mask_image_arr) * init_image + mask_image_arr * repainted_image -unmasked_unchanged_image = PIL.Image.fromarray(unmasked_unchanged_image_arr.round().astype("uint8")) -unmasked_unchanged_image.save("force_unmasked_unchanged.png") -make_image_grid([init_image, mask_image, repainted_image, unmasked_unchanged_image], rows=2, cols=2) -``` - ## Chained inpainting pipelines [`AutoPipelineForInpainting`] can be chained with other 🤗 Diffusers pipelines to edit their outputs. This is often useful for improving the output quality from your other diffusion pipelines, and if you're using multiple pipelines, it can be more memory-efficient to chain them together to keep the outputs in latent space and reuse the same pipeline components. 
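The "Chained inpainting pipelines" paragraph above mentions keeping outputs in latent space so pipeline components can be reused. Below is a minimal sketch (not part of the patch) of that idea, reusing the checkpoint and image URLs from the examples above; it assumes `AutoPipelineForImage2Image.from_pipe` and `output_type="latent"` behave as in a current diffusers release.

```py
import torch
from diffusers import AutoPipelineForImage2Image, AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid

# inpaint first, but keep the result as latents instead of decoding to a PIL image
pipeline = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
).to("cuda")

init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
latents = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, output_type="latent").images[0]

# reuse the same components for an image-to-image refinement pass without leaving latent space
refiner = AutoPipelineForImage2Image.from_pipe(pipeline)
image = refiner(prompt=prompt, image=latents).images[0]
make_image_grid([init_image, image], rows=1, cols=2)
```

Skipping the intermediate VAE decode/encode round trip is what makes this kind of chaining cheaper than saving and reloading a PIL image between the two passes.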
From a359ff764429cb7aaba8b11306b6d99d368538d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Tue, 14 Nov 2023 01:32:59 +0300 Subject: [PATCH 11/13] [`Docs`] Fix typos and update files at API's Main Classes, Models, and Schedulers pages (#5720) * Fix typos, update, add Copyright info, and trim trailing whitespaces * Update docs/source/en/api/loaders.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/api/models/autoencoder_tiny.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/api/models/autoencoder_tiny.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/api/activations.md | 14 +++++++- docs/source/en/api/attnprocessor.md | 12 +++++++ docs/source/en/api/image_processor.md | 6 ++-- .../en/api/internal_classes_overview.md | 12 +++++++ docs/source/en/api/loaders.md | 4 +-- docs/source/en/api/logging.md | 2 +- .../en/api/models/asymmetricautoencoderkl.md | 33 +++++++++++-------- docs/source/en/api/models/autoencoder_tiny.md | 20 ++++++++--- docs/source/en/api/models/autoencoderkl.md | 14 +++++++- docs/source/en/api/models/controlnet.md | 16 +++++++-- docs/source/en/api/models/overview.md | 18 ++++++++-- .../source/en/api/models/prior_transformer.md | 17 ++++++++-- docs/source/en/api/models/transformer2d.md | 12 +++++++ .../en/api/models/transformer_temporal.md | 14 +++++++- docs/source/en/api/models/unet-motion.md | 12 +++++++ docs/source/en/api/models/unet.md | 16 +++++++-- docs/source/en/api/models/unet2d-cond.md | 16 +++++++-- docs/source/en/api/models/unet2d.md | 16 +++++++-- docs/source/en/api/models/unet3d-cond.md | 16 +++++++-- docs/source/en/api/models/vq.md | 14 +++++++- docs/source/en/api/normalization.md | 18 +++++++++- docs/source/en/api/outputs.md | 4 +-- .../api/schedulers/cm_stochastic_iterative.md | 16 +++++++-- docs/source/en/api/schedulers/ddim.md | 14 ++++---- docs/source/en/api/schedulers/ddim_inverse.md | 2 +- docs/source/en/api/schedulers/ddpm.md | 4 +-- docs/source/en/api/schedulers/deis.md | 6 ++-- docs/source/en/api/schedulers/dpm_discrete.md | 2 +- .../api/schedulers/dpm_discrete_ancestral.md | 2 +- docs/source/en/api/schedulers/dpm_sde.md | 2 +- docs/source/en/api/schedulers/euler.md | 2 +- .../en/api/schedulers/euler_ancestral.md | 2 +- docs/source/en/api/schedulers/heun.md | 2 +- docs/source/en/api/schedulers/ipndm.md | 2 +- docs/source/en/api/schedulers/lcm.md | 12 +++++++ docs/source/en/api/schedulers/lms_discrete.md | 2 +- .../en/api/schedulers/multistep_dpm_solver.md | 4 +-- .../multistep_dpm_solver_inverse.md | 4 +-- docs/source/en/api/schedulers/overview.md | 2 +- docs/source/en/api/schedulers/pndm.md | 2 +- docs/source/en/api/schedulers/repaint.md | 4 +-- docs/source/en/api/schedulers/score_sde_ve.md | 4 +-- docs/source/en/api/schedulers/score_sde_vp.md | 2 +- .../api/schedulers/singlestep_dpm_solver.md | 4 +-- .../en/api/schedulers/stochastic_karras_ve.md | 4 +-- docs/source/en/api/schedulers/unipc.md | 8 ++--- docs/source/en/api/schedulers/vq_diffusion.md | 2 +- docs/source/en/api/utilities.md | 14 +++++++- 48 files changed, 337 insertions(+), 93 deletions(-) diff --git a/docs/source/en/api/activations.md b/docs/source/en/api/activations.md index 684238420ce1..e4f4567caca0 100644 --- a/docs/source/en/api/activations.md +++ b/docs/source/en/api/activations.md @@ -1,3 
+1,15 @@ + + # Activation functions Customized activation functions for supporting various models in 🤗 Diffusers. @@ -12,4 +24,4 @@ Customized activation functions for supporting various models in 🤗 Diffusers. ## ApproximateGELU -[[autodoc]] models.activations.ApproximateGELU \ No newline at end of file +[[autodoc]] models.activations.ApproximateGELU diff --git a/docs/source/en/api/attnprocessor.md b/docs/source/en/api/attnprocessor.md index 0b11c1f5bc5d..f6ee09f124be 100644 --- a/docs/source/en/api/attnprocessor.md +++ b/docs/source/en/api/attnprocessor.md @@ -1,3 +1,15 @@ + + # Attention Processor An attention processor is a class for applying different types of attention mechanisms. diff --git a/docs/source/en/api/image_processor.md b/docs/source/en/api/image_processor.md index 7fc66f5ee68e..fb446c944c3a 100644 --- a/docs/source/en/api/image_processor.md +++ b/docs/source/en/api/image_processor.md @@ -12,9 +12,9 @@ specific language governing permissions and limitations under the License. # VAE Image Processor -The [`VaeImageProcessor`] provides a unified API for [`StableDiffusionPipeline`]'s to prepare image inputs for VAE encoding and post-processing outputs once they're decoded. This includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays. +The [`VaeImageProcessor`] provides a unified API for [`StableDiffusionPipeline`]s to prepare image inputs for VAE encoding and post-processing outputs once they're decoded. This includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays. -All pipelines with [`VaeImageProcessor`] accepts PIL Image, PyTorch tensor, or NumPy arrays as image inputs and returns outputs based on the `output_type` argument by the user. You can pass encoded image latents directly to the pipeline and return latents from the pipeline as a specific output with the `output_type` argument (for example `output_type="pt"`). This allows you to take the generated latents from one pipeline and pass it to another pipeline as input without leaving the latent space. It also makes it much easier to use multiple pipelines together by passing PyTorch tensors directly between different pipelines. +All pipelines with [`VaeImageProcessor`] accept PIL Image, PyTorch tensor, or NumPy arrays as image inputs and return outputs based on the `output_type` argument by the user. You can pass encoded image latents directly to the pipeline and return latents from the pipeline as a specific output with the `output_type` argument (for example `output_type="latent"`). This allows you to take the generated latents from one pipeline and pass it to another pipeline as input without leaving the latent space. It also makes it much easier to use multiple pipelines together by passing PyTorch tensors directly between different pipelines. ## VaeImageProcessor @@ -24,4 +24,4 @@ All pipelines with [`VaeImageProcessor`] accepts PIL Image, PyTorch tensor, or N The [`VaeImageProcessorLDM3D`] accepts RGB and depth inputs and returns RGB and depth outputs. 
-[[autodoc]] image_processor.VaeImageProcessorLDM3D \ No newline at end of file +[[autodoc]] image_processor.VaeImageProcessorLDM3D diff --git a/docs/source/en/api/internal_classes_overview.md b/docs/source/en/api/internal_classes_overview.md index 421a22d5ceb5..5c8d2cc0e387 100644 --- a/docs/source/en/api/internal_classes_overview.md +++ b/docs/source/en/api/internal_classes_overview.md @@ -1,3 +1,15 @@ + + # Overview The APIs in this section are more experimental and prone to breaking changes. Most of them are used internally for development, but they may also be useful to you if you're interested in building a diffusion model with some custom parts or if you're interested in some of our helper utilities for working with 🤗 Diffusers. diff --git a/docs/source/en/api/loaders.md b/docs/source/en/api/loaders.md index 5c7c3ef660ca..d81b0eb1abcb 100644 --- a/docs/source/en/api/loaders.md +++ b/docs/source/en/api/loaders.md @@ -12,11 +12,11 @@ specific language governing permissions and limitations under the License. # Loaders -Adapters (textual inversion, LoRA, hypernetworks) allow you to modify a diffusion model to generate images in a specific style without training or finetuning the entire model. The adapter weights are typically only a tiny fraction of the pretrained model's which making them very portable. 🤗 Diffusers provides an easy-to-use `LoaderMixin` API to load adapter weights. +Adapters (textual inversion, LoRA, hypernetworks) allow you to modify a diffusion model to generate images in a specific style without training or finetuning the entire model. The adapter weights are very portable because they're typically only a tiny fraction of the pretrained model weights. 🤗 Diffusers provides an easy-to-use `LoaderMixin` API to load adapter weights. -🧪 The `LoaderMixins` are highly experimental and prone to future changes. To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `huggingface-cli login`. +🧪 The `LoaderMixin`s are highly experimental and prone to future changes. To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `huggingface-cli login`. diff --git a/docs/source/en/api/logging.md b/docs/source/en/api/logging.md index cc2d012691ac..b31b7c11755e 100644 --- a/docs/source/en/api/logging.md +++ b/docs/source/en/api/logging.md @@ -51,7 +51,7 @@ logger.warning("WARN") All methods of the logging module are documented below. The main methods are [`logging.get_verbosity`] to get the current level of verbosity in the logger and -[`logging.set_verbosity`] to set the verbosity to the level of your choice. +[`logging.set_verbosity`] to set the verbosity to the level of your choice. In order from the least verbose to the most verbose: diff --git a/docs/source/en/api/models/asymmetricautoencoderkl.md b/docs/source/en/api/models/asymmetricautoencoderkl.md index c7b3ee9b5155..1e102943c5e4 100644 --- a/docs/source/en/api/models/asymmetricautoencoderkl.md +++ b/docs/source/en/api/models/asymmetricautoencoderkl.md @@ -1,3 +1,15 @@ + + # AsymmetricAutoencoderKL Improved larger variational autoencoder (VAE) model with KL loss for inpainting task: [Designing a Better Asymmetric VQGAN for StableDiffusion](https://arxiv.org/abs/2306.04632) by Zixin Zhu, Xuelu Feng, Dongdong Chen, Jianmin Bao, Le Wang, Yinpeng Chen, Lu Yuan, Gang Hua. 
@@ -6,7 +18,7 @@ The abstract from the paper is: *StableDiffusion is a revolutionary text-to-image generator that is causing a stir in the world of image generation and editing. Unlike traditional methods that learn a diffusion model in pixel space, StableDiffusion learns a diffusion model in the latent space via a VQGAN, ensuring both efficiency and quality. It not only supports image generation tasks, but also enables image editing for real images, such as image inpainting and local editing. However, we have observed that the vanilla VQGAN used in StableDiffusion leads to significant information loss, causing distortion artifacts even in non-edited image regions. To this end, we propose a new asymmetric VQGAN with two simple designs. Firstly, in addition to the input from the encoder, the decoder contains a conditional branch that incorporates information from task-specific priors, such as the unmasked image region in inpainting. Secondly, the decoder is much heavier than the encoder, allowing for more detailed recovery while only slightly increasing the total inference cost. The training cost of our asymmetric VQGAN is cheap, and we only need to retrain a new asymmetric decoder while keeping the vanilla VQGAN encoder and StableDiffusion unchanged. Our asymmetric VQGAN can be widely used in StableDiffusion-based inpainting and local editing methods. Extensive experiments demonstrate that it can significantly improve the inpainting and editing performance, while maintaining the original text-to-image capability. The code is available at https://github.com/buxiangzhiren/Asymmetric_VQGAN* -Evaluation results can be found in section 4.1 of the original paper. +Evaluation results can be found in section 4.1 of the original paper. ## Available checkpoints @@ -16,30 +28,23 @@ Evaluation results can be found in section 4.1 of the original paper. 
## Example Usage ```python -from io import BytesIO -from PIL import Image -import requests from diffusers import AsymmetricAutoencoderKL, StableDiffusionInpaintPipeline +from diffusers.utils import load_image, make_image_grid -def download_image(url: str) -> Image.Image: - response = requests.get(url) - return Image.open(BytesIO(response.content)).convert("RGB") - - -prompt = "a photo of a person" +prompt = "a photo of a person with beard" img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png" mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png" -image = download_image(img_url).resize((256, 256)) -mask_image = download_image(mask_url).resize((256, 256)) +original_image = load_image(img_url).resize((512, 512)) +mask_image = load_image(mask_url).resize((512, 512)) pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") pipe.vae = AsymmetricAutoencoderKL.from_pretrained("cross-attention/asymmetric-autoencoder-kl-x-1-5") pipe.to("cuda") -image = pipe(prompt=prompt, image=image, mask_image=mask_image).images[0] -image.save("image.jpeg") +image = pipe(prompt=prompt, image=original_image, mask_image=mask_image).images[0] +make_image_grid([original_image, mask_image, image], rows=1, cols=3) ``` ## AsymmetricAutoencoderKL diff --git a/docs/source/en/api/models/autoencoder_tiny.md b/docs/source/en/api/models/autoencoder_tiny.md index 9b97b6e8e999..1d19539bffe8 100644 --- a/docs/source/en/api/models/autoencoder_tiny.md +++ b/docs/source/en/api/models/autoencoder_tiny.md @@ -1,6 +1,18 @@ + + # Tiny AutoEncoder -Tiny AutoEncoder for Stable Diffusion (TAESD) was introduced in [madebyollin/taesd](https://github.com/madebyollin/taesd) by Ollin Boer Bohan. It is a tiny distilled version of Stable Diffusion's VAE that can quickly decode the latents in a [`StableDiffusionPipeline`] or [`StableDiffusionXLPipeline`] almost instantly. +Tiny AutoEncoder for Stable Diffusion (TAESD) was introduced in [madebyollin/taesd](https://github.com/madebyollin/taesd) by Ollin Boer Bohan. It is a tiny distilled version of Stable Diffusion's VAE that can quickly decode the latents in a [`StableDiffusionPipeline`] or [`StableDiffusionXLPipeline`] almost instantly. To use with Stable Diffusion v-2.1: @@ -16,7 +28,7 @@ pipe = pipe.to("cuda") prompt = "slice of delicious New York-style berry cheesecake" image = pipe(prompt, num_inference_steps=25).images[0] -image.save("cheesecake.png") +image ``` To use with Stable Diffusion XL 1.0 @@ -33,7 +45,7 @@ pipe = pipe.to("cuda") prompt = "slice of delicious New York-style berry cheesecake" image = pipe(prompt, num_inference_steps=25).images[0] -image.save("cheesecake_sdxl.png") +image ``` ## AutoencoderTiny @@ -42,4 +54,4 @@ image.save("cheesecake_sdxl.png") ## AutoencoderTinyOutput -[[autodoc]] models.autoencoder_tiny.AutoencoderTinyOutput \ No newline at end of file +[[autodoc]] models.autoencoder_tiny.AutoencoderTinyOutput diff --git a/docs/source/en/api/models/autoencoderkl.md b/docs/source/en/api/models/autoencoderkl.md index bc709c422d36..f42a4d2941dd 100644 --- a/docs/source/en/api/models/autoencoderkl.md +++ b/docs/source/en/api/models/autoencoderkl.md @@ -1,3 +1,15 @@ + + # AutoencoderKL The variational autoencoder (VAE) model with KL loss was introduced in [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114v11) by Diederik P. Kingma and Max Welling. 
The model is used in 🤗 Diffusers to encode images into latents and to decode latent representations into images. @@ -14,7 +26,7 @@ from the original format using [`FromOriginalVAEMixin.from_single_file`] as foll ```py from diffusers import AutoencoderKL -url = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors" # can also be local file +url = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors" # can also be a local file model = AutoencoderKL.from_single_file(url) ``` diff --git a/docs/source/en/api/models/controlnet.md b/docs/source/en/api/models/controlnet.md index 58359723a08e..12bc0110f208 100644 --- a/docs/source/en/api/models/controlnet.md +++ b/docs/source/en/api/models/controlnet.md @@ -1,10 +1,22 @@ + + # ControlNet -The ControlNet model was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang and Maneesh Agrawala. It provides a greater degree of control over text-to-image generation by conditioning the model on additional inputs such as edge maps, depth maps, segmentation maps, and keypoints for pose detection. +The ControlNet model was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, Maneesh Agrawala. It provides a greater degree of control over text-to-image generation by conditioning the model on additional inputs such as edge maps, depth maps, segmentation maps, and keypoints for pose detection. The abstract from the paper is: -*We present a neural network structure, ControlNet, to control pretrained large diffusion models to support additional input conditions. The ControlNet learns task-specific conditions in an end-to-end way, and the learning is robust even when the training dataset is small (< 50k). Moreover, training a ControlNet is as fast as fine-tuning a diffusion model, and the model can be trained on a personal devices. Alternatively, if powerful computation clusters are available, the model can scale to large amounts (millions to billions) of data. We report that large diffusion models like Stable Diffusion can be augmented with ControlNets to enable conditional inputs like edge maps, segmentation maps, keypoints, etc. This may enrich the methods to control large diffusion models and further facilitate related applications.* +*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. 
Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.* ## Loading from the original format diff --git a/docs/source/en/api/models/overview.md b/docs/source/en/api/models/overview.md index 9887c6f75741..ab8d9d4e7839 100644 --- a/docs/source/en/api/models/overview.md +++ b/docs/source/en/api/models/overview.md @@ -1,8 +1,20 @@ + + # Models -🤗 Diffusers provides pretrained models for popular algorithms and modules to create custom diffusion systems. The primary function of models is to denoise an input sample as modeled by the distribution \\(p_{\theta}(x_{t-1}|x_{t})\\). +🤗 Diffusers provides pretrained models for popular algorithms and modules to create custom diffusion systems. The primary function of models is to denoise an input sample as modeled by the distribution \\(p_{\theta}(x_{t-1}|x_{t})\\). -All models are built from the base [`ModelMixin`] class which is a [`torch.nn.module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) providing basic functionality for saving and loading models, locally and from the Hugging Face Hub. +All models are built from the base [`ModelMixin`] class which is a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) providing basic functionality for saving and loading models, locally and from the Hugging Face Hub. ## ModelMixin [[autodoc]] ModelMixin @@ -13,4 +25,4 @@ All models are built from the base [`ModelMixin`] class which is a [`torch.nn.mo ## PushToHubMixin -[[autodoc]] utils.PushToHubMixin \ No newline at end of file +[[autodoc]] utils.PushToHubMixin diff --git a/docs/source/en/api/models/prior_transformer.md b/docs/source/en/api/models/prior_transformer.md index 1d2b799ed323..0b849c300662 100644 --- a/docs/source/en/api/models/prior_transformer.md +++ b/docs/source/en/api/models/prior_transformer.md @@ -1,7 +1,18 @@ + + # Prior Transformer -The Prior Transformer was originally introduced in [Hierarchical Text-Conditional Image Generation with CLIP Latents -](https://huggingface.co/papers/2204.06125) by Ramesh et al. It is used to predict CLIP image embeddings from CLIP text embeddings; image embeddings are predicted through a denoising diffusion process. +The Prior Transformer was originally introduced in [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://huggingface.co/papers/2204.06125) by Ramesh et al. It is used to predict CLIP image embeddings from CLIP text embeddings; image embeddings are predicted through a denoising diffusion process. The abstract from the paper is: @@ -13,4 +24,4 @@ The abstract from the paper is: ## PriorTransformerOutput -[[autodoc]] models.prior_transformer.PriorTransformerOutput \ No newline at end of file +[[autodoc]] models.prior_transformer.PriorTransformerOutput diff --git a/docs/source/en/api/models/transformer2d.md b/docs/source/en/api/models/transformer2d.md index 4ad2b00b6f23..0f891edd754a 100644 --- a/docs/source/en/api/models/transformer2d.md +++ b/docs/source/en/api/models/transformer2d.md @@ -1,3 +1,15 @@ + + # Transformer2D A Transformer model for image-like data from [CompVis](https://huggingface.co/CompVis) that is based on the [Vision Transformer](https://huggingface.co/papers/2010.11929) introduced by Dosovitskiy et al. The [`Transformer2DModel`] accepts discrete (classes of vector embeddings) or continuous (actual embeddings) inputs. 
diff --git a/docs/source/en/api/models/transformer_temporal.md b/docs/source/en/api/models/transformer_temporal.md index d67cf717f92b..c936270b7927 100644 --- a/docs/source/en/api/models/transformer_temporal.md +++ b/docs/source/en/api/models/transformer_temporal.md @@ -1,3 +1,15 @@ + + # Transformer Temporal A Transformer model for video-like data. @@ -8,4 +20,4 @@ A Transformer model for video-like data. ## TransformerTemporalModelOutput -[[autodoc]] models.transformer_temporal.TransformerTemporalModelOutput \ No newline at end of file +[[autodoc]] models.transformer_temporal.TransformerTemporalModelOutput diff --git a/docs/source/en/api/models/unet-motion.md b/docs/source/en/api/models/unet-motion.md index 07d4df64c35f..cbc8c30ff64f 100644 --- a/docs/source/en/api/models/unet-motion.md +++ b/docs/source/en/api/models/unet-motion.md @@ -1,3 +1,15 @@ + + # UNetMotionModel The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on it's number of dimensions and whether it is a conditional model or not. This is a 2D UNet model. diff --git a/docs/source/en/api/models/unet.md b/docs/source/en/api/models/unet.md index 9a488a3231a6..66508b469a60 100644 --- a/docs/source/en/api/models/unet.md +++ b/docs/source/en/api/models/unet.md @@ -1,6 +1,18 @@ + + # UNet1DModel -The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on it's number of dimensions and whether it is a conditional model or not. This is a 1D UNet model. +The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al. for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on it's number of dimensions and whether it is a conditional model or not. This is a 1D UNet model. The abstract from the paper is: @@ -10,4 +22,4 @@ The abstract from the paper is: [[autodoc]] UNet1DModel ## UNet1DOutput -[[autodoc]] models.unet_1d.UNet1DOutput \ No newline at end of file +[[autodoc]] models.unet_1d.UNet1DOutput diff --git a/docs/source/en/api/models/unet2d-cond.md b/docs/source/en/api/models/unet2d-cond.md index a669b02a7fe8..ea385ff92426 100644 --- a/docs/source/en/api/models/unet2d-cond.md +++ b/docs/source/en/api/models/unet2d-cond.md @@ -1,6 +1,18 @@ + + # UNet2DConditionModel -The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. 
It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on it's number of dimensions and whether it is a conditional model or not. This is a 2D UNet conditional model. +The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al. for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on it's number of dimensions and whether it is a conditional model or not. This is a 2D UNet conditional model. The abstract from the paper is: @@ -16,4 +28,4 @@ The abstract from the paper is: [[autodoc]] models.unet_2d_condition_flax.FlaxUNet2DConditionModel ## FlaxUNet2DConditionOutput -[[autodoc]] models.unet_2d_condition_flax.FlaxUNet2DConditionOutput \ No newline at end of file +[[autodoc]] models.unet_2d_condition_flax.FlaxUNet2DConditionOutput diff --git a/docs/source/en/api/models/unet2d.md b/docs/source/en/api/models/unet2d.md index 29e8163f646c..7669d4a5d75a 100644 --- a/docs/source/en/api/models/unet2d.md +++ b/docs/source/en/api/models/unet2d.md @@ -1,6 +1,18 @@ + + # UNet2DModel -The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on it's number of dimensions and whether it is a conditional model or not. This is a 2D UNet model. +The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al. for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on it's number of dimensions and whether it is a conditional model or not. This is a 2D UNet model. The abstract from the paper is: @@ -10,4 +22,4 @@ The abstract from the paper is: [[autodoc]] UNet2DModel ## UNet2DOutput -[[autodoc]] models.unet_2d.UNet2DOutput \ No newline at end of file +[[autodoc]] models.unet_2d.UNet2DOutput diff --git a/docs/source/en/api/models/unet3d-cond.md b/docs/source/en/api/models/unet3d-cond.md index 83dbb514c8dd..4eea0a6d1cd2 100644 --- a/docs/source/en/api/models/unet3d-cond.md +++ b/docs/source/en/api/models/unet3d-cond.md @@ -1,6 +1,18 @@ + + # UNet3DConditionModel -The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on it's number of dimensions and whether it is a conditional model or not. 
This is a 3D UNet conditional model. +The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al. for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on it's number of dimensions and whether it is a conditional model or not. This is a 3D UNet conditional model. The abstract from the paper is: @@ -10,4 +22,4 @@ The abstract from the paper is: [[autodoc]] UNet3DConditionModel ## UNet3DConditionOutput -[[autodoc]] models.unet_3d_condition.UNet3DConditionOutput \ No newline at end of file +[[autodoc]] models.unet_3d_condition.UNet3DConditionOutput diff --git a/docs/source/en/api/models/vq.md b/docs/source/en/api/models/vq.md index cdb6761468a8..c288b163b28f 100644 --- a/docs/source/en/api/models/vq.md +++ b/docs/source/en/api/models/vq.md @@ -1,3 +1,15 @@ + + # VQModel The VQ-VAE model was introduced in [Neural Discrete Representation Learning](https://huggingface.co/papers/1711.00937) by Aaron van den Oord, Oriol Vinyals and Koray Kavukcuoglu. The model is used in 🤗 Diffusers to decode latent representations into images. Unlike [`AutoencoderKL`], the [`VQModel`] works in a quantized latent space. @@ -12,4 +24,4 @@ The abstract from the paper is: ## VQEncoderOutput -[[autodoc]] models.vq_model.VQEncoderOutput \ No newline at end of file +[[autodoc]] models.vq_model.VQEncoderOutput diff --git a/docs/source/en/api/normalization.md b/docs/source/en/api/normalization.md index 7e09976b1565..ccc643ac5e31 100644 --- a/docs/source/en/api/normalization.md +++ b/docs/source/en/api/normalization.md @@ -1,3 +1,15 @@ + + # Normalization layers Customized normalization layers for supporting various models in 🤗 Diffusers. @@ -10,6 +22,10 @@ Customized normalization layers for supporting various models in 🤗 Diffusers. [[autodoc]] models.normalization.AdaLayerNormZero +## AdaLayerNormSingle + +[[autodoc]] models.normalization.AdaLayerNormSingle + ## AdaGroupNorm -[[autodoc]] models.normalization.AdaGroupNorm \ No newline at end of file +[[autodoc]] models.normalization.AdaGroupNorm diff --git a/docs/source/en/api/outputs.md b/docs/source/en/api/outputs.md index ec64d36498ee..30bad5646e91 100644 --- a/docs/source/en/api/outputs.md +++ b/docs/source/en/api/outputs.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # Outputs -All models outputs are subclasses of [`~utils.BaseOutput`], data structures containing all the information returned by the model. The outputs can also be used as tuples or dictionaries. +All model outputs are subclasses of [`~utils.BaseOutput`], data structures containing all the information returned by the model. The outputs can also be used as tuples or dictionaries. 
For example: @@ -64,4 +64,4 @@ To check a specific pipeline or model output, refer to its corresponding API doc ## ImageTextPipelineOutput -[[autodoc]] ImageTextPipelineOutput \ No newline at end of file +[[autodoc]] ImageTextPipelineOutput diff --git a/docs/source/en/api/schedulers/cm_stochastic_iterative.md b/docs/source/en/api/schedulers/cm_stochastic_iterative.md index a1d5f64036e6..c112c89a12fc 100644 --- a/docs/source/en/api/schedulers/cm_stochastic_iterative.md +++ b/docs/source/en/api/schedulers/cm_stochastic_iterative.md @@ -1,10 +1,22 @@ + + # CMStochasticIterativeScheduler [Consistency Models](https://huggingface.co/papers/2303.01469) by Yang Song, Prafulla Dhariwal, Mark Chen, and Ilya Sutskever introduced a multistep and onestep scheduler (Algorithm 1) that is capable of generating good samples in one or a small number of steps. The abstract from the paper is: -*Diffusion models have made significant breakthroughs in image, audio, and video generation, but they depend on an iterative generation process that causes slow sampling speed and caps their potential for real-time applications. To overcome this limitation, we propose consistency models, a new family of generative models that achieve high sample quality without adversarial training. They support fast one-step generation by design, while still allowing for few-step sampling to trade compute for sample quality. They also support zero-shot data editing, like image inpainting, colorization, and super-resolution, without requiring explicit training on these tasks. Consistency models can be trained either as a way to distill pre-trained diffusion models, or as standalone generative models. Through extensive experiments, we demonstrate that they outperform existing distillation techniques for diffusion models in one- and few-step generation. For example, we achieve the new state-of-the-art FID of 3.55 on CIFAR-10 and 6.20 on ImageNet 64x64 for one-step generation. When trained as standalone generative models, consistency models also outperform single-step, non-adversarial generative models on standard benchmarks like CIFAR-10, ImageNet 64x64 and LSUN 256x256.* +*Diffusion models have significantly advanced the fields of image, audio, and video generation, but they depend on an iterative sampling process that causes slow generation. To overcome this limitation, we propose consistency models, a new family of models that generate high quality samples by directly mapping noise to data. They support fast one-step generation by design, while still allowing multistep sampling to trade compute for sample quality. They also support zero-shot data editing, such as image inpainting, colorization, and super-resolution, without requiring explicit training on these tasks. Consistency models can be trained either by distilling pre-trained diffusion models, or as standalone generative models altogether. Through extensive experiments, we demonstrate that they outperform existing distillation techniques for diffusion models in one- and few-step sampling, achieving the new state-of-the-art FID of 3.55 on CIFAR-10 and 6.20 on ImageNet 64x64 for one-step generation. When trained in isolation, consistency models become a new family of generative models that can outperform existing one-step, non-adversarial generative models on standard benchmarks such as CIFAR-10, ImageNet 64x64 and LSUN 256x256.* The original codebase can be found at [openai/consistency_models](https://github.com/openai/consistency_models). 
@@ -12,4 +24,4 @@ The original codebase can be found at [openai/consistency_models](https://github [[autodoc]] CMStochasticIterativeScheduler ## CMStochasticIterativeSchedulerOutput -[[autodoc]] schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput diff --git a/docs/source/en/api/schedulers/ddim.md b/docs/source/en/api/schedulers/ddim.md index c5b79cb95fc9..422b74cff3a9 100644 --- a/docs/source/en/api/schedulers/ddim.md +++ b/docs/source/en/api/schedulers/ddim.md @@ -16,13 +16,11 @@ specific language governing permissions and limitations under the License. The abstract from the paper is: -*Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, -yet they require simulating a Markov chain for many steps to produce a sample. +*Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, yet they require simulating a Markov chain for many steps to produce a sample. To accelerate sampling, we present denoising diffusion implicit models (DDIMs), a more efficient class of iterative implicit probabilistic models -with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a Markovian diffusion process. +with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a Markovian diffusion process. We construct a class of non-Markovian diffusion processes that lead to the same training objective, but whose reverse process can be much faster to sample from. -We empirically demonstrate that DDIMs can produce high quality samples 10× to 50× faster in terms of wall-clock time compared to DDPMs, allow us to trade off -computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space.* +We empirically demonstrate that DDIMs can produce high quality samples 10× to 50× faster in terms of wall-clock time compared to DDPMs, allow us to trade off computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space.* The original codebase of this paper can be found at [ermongroup/ddim](https://github.com/ermongroup/ddim), and you can contact the author on [tsong.me](https://tsong.me/). @@ -57,13 +55,14 @@ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spaci 4. 
rescale classifier-free guidance to prevent over-exposure ```py -image = pipeline(prompt, guidance_rescale=0.7).images[0] +image = pipe(prompt, guidance_rescale=0.7).images[0] ``` For example: ```py from diffusers import DiffusionPipeline, DDIMScheduler +import torch pipe = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", torch_dtype=torch.float16) pipe.scheduler = DDIMScheduler.from_config( @@ -72,7 +71,8 @@ pipe.scheduler = DDIMScheduler.from_config( pipe.to("cuda") prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" -image = pipeline(prompt, guidance_rescale=0.7).images[0] +image = pipe(prompt, guidance_rescale=0.7).images[0] +image ``` ## DDIMScheduler diff --git a/docs/source/en/api/schedulers/ddim_inverse.md b/docs/source/en/api/schedulers/ddim_inverse.md index 52c6d7c8595f..9b28b9dc5950 100644 --- a/docs/source/en/api/schedulers/ddim_inverse.md +++ b/docs/source/en/api/schedulers/ddim_inverse.md @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. # DDIMInverseScheduler `DDIMInverseScheduler` is the inverted scheduler from [Denoising Diffusion Implicit Models](https://huggingface.co/papers/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon. -The implementation is mostly based on the DDIM inversion definition from [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://huggingface.co/papers/2211.09794.pdf). +The implementation is mostly based on the DDIM inversion definition from [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://huggingface.co/papers/2211.09794). ## DDIMInverseScheduler [[autodoc]] DDIMInverseScheduler diff --git a/docs/source/en/api/schedulers/ddpm.md b/docs/source/en/api/schedulers/ddpm.md index c006850e5d44..5402d8863df6 100644 --- a/docs/source/en/api/schedulers/ddpm.md +++ b/docs/source/en/api/schedulers/ddpm.md @@ -16,10 +16,10 @@ specific language governing permissions and limitations under the License. The abstract from the paper is: -*We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN.* +*We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN. 
Our implementation is available at [this https URL](https://github.com/hojonathanho/diffusion).* ## DDPMScheduler [[autodoc]] DDPMScheduler ## DDPMSchedulerOutput -[[autodoc]] schedulers.scheduling_ddpm.DDPMSchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_ddpm.DDPMSchedulerOutput diff --git a/docs/source/en/api/schedulers/deis.md b/docs/source/en/api/schedulers/deis.md index 563ede9f0da9..fc05dd39ee61 100644 --- a/docs/source/en/api/schedulers/deis.md +++ b/docs/source/en/api/schedulers/deis.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # DEISMultistepScheduler -Diffusion Exponential Integrator Sampler (DEIS) is proposed in [Fast Sampling of Diffusion Models with Exponential Integrator](https://huggingface.co/papers/2204.13902) by Qinsheng Zhang and Yongxin Chen. `DEISMultistepScheduler` is a fast high order solver for diffusion ordinary differential equations (ODEs). +Diffusion Exponential Integrator Sampler (DEIS) is proposed in [Fast Sampling of Diffusion Models with Exponential Integrator](https://huggingface.co/papers/2204.13902) by Qinsheng Zhang and Yongxin Chen. `DEISMultistepScheduler` is a fast high order solver for diffusion ordinary differential equations (ODEs). This implementation modifies the polynomial fitting formula in log-rho space instead of the original linear `t` space in the DEIS paper. The modification enjoys closed-form coefficients for exponential multistep update instead of replying on the numerical solver. @@ -20,8 +20,6 @@ The abstract from the paper is: *The past few years have witnessed the great success of Diffusion models~(DMs) in generating high-fidelity samples in generative modeling tasks. A major limitation of the DM is its notoriously slow sampling procedure which normally requires hundreds to thousands of time discretization steps of the learned diffusion process to reach the desired accuracy. Our goal is to develop a fast sampling method for DMs with a much less number of steps while retaining high sample quality. To this end, we systematically analyze the sampling procedure in DMs and identify key factors that affect the sample quality, among which the method of discretization is most crucial. By carefully examining the learned diffusion process, we propose Diffusion Exponential Integrator Sampler~(DEIS). It is based on the Exponential Integrator designed for discretizing ordinary differential equations (ODEs) and leverages a semilinear structure of the learned diffusion process to reduce the discretization error. The proposed method can be applied to any DMs and can generate high-fidelity samples in as few as 10 steps. In our experiments, it takes about 3 minutes on one A6000 GPU to generate 50k images from CIFAR10. Moreover, by directly using pre-trained DMs, we achieve the state-of-art sampling performance when the number of score function evaluation~(NFE) is limited, e.g., 4.17 FID with 10 NFEs, 3.37 FID, and 9.74 IS with only 15 NFEs on CIFAR10. Code is available at [this https URL](https://github.com/qsh-zh/deis).* -The original codebase can be found at [qsh-zh/deis](https://github.com/qsh-zh/deis). - ## Tips It is recommended to set `solver_order` to 2 or 3, while `solver_order=1` is equivalent to [`DDIMScheduler`]. 
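As a quick illustration of the `solver_order` tip above, here is a hedged sketch of swapping `DEISMultistepScheduler` into an existing pipeline; the checkpoint name is only an example, and the `from_config` override pattern mirrors the DDIM snippet earlier in this patch.

```py
import torch
from diffusers import DEISMultistepScheduler, DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
# solver_order=2 or 3 is recommended; solver_order=1 reduces to DDIM behavior
pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config, solver_order=2)
pipe.to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(prompt, num_inference_steps=20).images[0]
image
```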
@@ -33,4 +31,4 @@ diffusion models, you can set `thresholding=True` to use the dynamic thresholdin [[autodoc]] DEISMultistepScheduler ## SchedulerOutput -[[autodoc]] schedulers.scheduling_utils.SchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_utils.SchedulerOutput diff --git a/docs/source/en/api/schedulers/dpm_discrete.md b/docs/source/en/api/schedulers/dpm_discrete.md index a8a95a10404f..eea09915c68a 100644 --- a/docs/source/en/api/schedulers/dpm_discrete.md +++ b/docs/source/en/api/schedulers/dpm_discrete.md @@ -20,4 +20,4 @@ The original codebase can be found at [crowsonkb/k-diffusion](https://github.com [[autodoc]] KDPM2DiscreteScheduler ## SchedulerOutput -[[autodoc]] schedulers.scheduling_utils.SchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_utils.SchedulerOutput diff --git a/docs/source/en/api/schedulers/dpm_discrete_ancestral.md b/docs/source/en/api/schedulers/dpm_discrete_ancestral.md index 61c68f1cb5e2..5f8ae193c5a7 100644 --- a/docs/source/en/api/schedulers/dpm_discrete_ancestral.md +++ b/docs/source/en/api/schedulers/dpm_discrete_ancestral.md @@ -20,4 +20,4 @@ The original codebase can be found at [crowsonkb/k-diffusion](https://github.com [[autodoc]] KDPM2AncestralDiscreteScheduler ## SchedulerOutput -[[autodoc]] schedulers.scheduling_utils.SchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_utils.SchedulerOutput diff --git a/docs/source/en/api/schedulers/dpm_sde.md b/docs/source/en/api/schedulers/dpm_sde.md index 1eb8b6b6662b..1486ba3d275e 100644 --- a/docs/source/en/api/schedulers/dpm_sde.md +++ b/docs/source/en/api/schedulers/dpm_sde.md @@ -18,4 +18,4 @@ The `DPMSolverSDEScheduler` is inspired by the stochastic sampler from the [Eluc [[autodoc]] DPMSolverSDEScheduler ## SchedulerOutput -[[autodoc]] schedulers.scheduling_utils.SchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_utils.SchedulerOutput diff --git a/docs/source/en/api/schedulers/euler.md b/docs/source/en/api/schedulers/euler.md index f1b6ed11467a..92743283370d 100644 --- a/docs/source/en/api/schedulers/euler.md +++ b/docs/source/en/api/schedulers/euler.md @@ -19,4 +19,4 @@ The Euler scheduler (Algorithm 2) is from the [Elucidating the Design Space of D [[autodoc]] EulerDiscreteScheduler ## EulerDiscreteSchedulerOutput -[[autodoc]] schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput diff --git a/docs/source/en/api/schedulers/euler_ancestral.md b/docs/source/en/api/schedulers/euler_ancestral.md index f0e817b49bb3..c78a407d2eb2 100644 --- a/docs/source/en/api/schedulers/euler_ancestral.md +++ b/docs/source/en/api/schedulers/euler_ancestral.md @@ -18,4 +18,4 @@ A scheduler that uses ancestral sampling with Euler method steps. 
This is a fast [[autodoc]] EulerAncestralDiscreteScheduler ## EulerAncestralDiscreteSchedulerOutput -[[autodoc]] schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput diff --git a/docs/source/en/api/schedulers/heun.md b/docs/source/en/api/schedulers/heun.md index 725c1a67f437..abfde24a1678 100644 --- a/docs/source/en/api/schedulers/heun.md +++ b/docs/source/en/api/schedulers/heun.md @@ -18,4 +18,4 @@ The Heun scheduler (Algorithm 1) is from the [Elucidating the Design Space of Di [[autodoc]] HeunDiscreteScheduler ## SchedulerOutput -[[autodoc]] schedulers.scheduling_utils.SchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_utils.SchedulerOutput diff --git a/docs/source/en/api/schedulers/ipndm.md b/docs/source/en/api/schedulers/ipndm.md index 68a1d58dec3c..b81206493494 100644 --- a/docs/source/en/api/schedulers/ipndm.md +++ b/docs/source/en/api/schedulers/ipndm.md @@ -18,4 +18,4 @@ specific language governing permissions and limitations under the License. [[autodoc]] IPNDMScheduler ## SchedulerOutput -[[autodoc]] schedulers.scheduling_utils.SchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_utils.SchedulerOutput diff --git a/docs/source/en/api/schedulers/lcm.md b/docs/source/en/api/schedulers/lcm.md index fb55e52ac1f3..5223072fd153 100644 --- a/docs/source/en/api/schedulers/lcm.md +++ b/docs/source/en/api/schedulers/lcm.md @@ -1,3 +1,15 @@ + + # Latent Consistency Model Multistep Scheduler ## Overview diff --git a/docs/source/en/api/schedulers/lms_discrete.md b/docs/source/en/api/schedulers/lms_discrete.md index 5fe90dc4e77e..46d95da5fcd9 100644 --- a/docs/source/en/api/schedulers/lms_discrete.md +++ b/docs/source/en/api/schedulers/lms_discrete.md @@ -18,4 +18,4 @@ specific language governing permissions and limitations under the License. [[autodoc]] LMSDiscreteScheduler ## LMSDiscreteSchedulerOutput -[[autodoc]] schedulers.scheduling_lms_discrete.LMSDiscreteSchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_lms_discrete.LMSDiscreteSchedulerOutput diff --git a/docs/source/en/api/schedulers/multistep_dpm_solver.md b/docs/source/en/api/schedulers/multistep_dpm_solver.md index 3dffa54d44a7..ce6bde554463 100644 --- a/docs/source/en/api/schedulers/multistep_dpm_solver.md +++ b/docs/source/en/api/schedulers/multistep_dpm_solver.md @@ -21,7 +21,7 @@ samples, and it can generate quite good samples even in 10 steps. It is recommended to set `solver_order` to 2 for guide sampling, and `solver_order=3` for unconditional sampling. -Dynamic thresholding from Imagen (https://huggingface.co/papers/2205.11487) is supported, and for pixel-space +Dynamic thresholding from [Imagen](https://huggingface.co/papers/2205.11487) is supported, and for pixel-space diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic thresholding. This thresholding method is unsuitable for latent-space diffusion models such as Stable Diffusion. 
@@ -32,4 +32,4 @@ The SDE variant of DPMSolver and DPM-Solver++ is also supported, but only for th [[autodoc]] DPMSolverMultistepScheduler ## SchedulerOutput -[[autodoc]] schedulers.scheduling_utils.SchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_utils.SchedulerOutput diff --git a/docs/source/en/api/schedulers/multistep_dpm_solver_inverse.md b/docs/source/en/api/schedulers/multistep_dpm_solver_inverse.md index b63519b41fe6..6a286f3d0ce1 100644 --- a/docs/source/en/api/schedulers/multistep_dpm_solver_inverse.md +++ b/docs/source/en/api/schedulers/multistep_dpm_solver_inverse.md @@ -14,11 +14,11 @@ specific language governing permissions and limitations under the License. `DPMSolverMultistepInverse` is the inverted scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu. -The implementation is mostly based on the DDIM inversion definition of [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://huggingface.co/papers/2211.09794.pdf) and notebook implementation of the [`DiffEdit`] latent inversion from [Xiang-cd/DiffEdit-stable-diffusion](https://github.com/Xiang-cd/DiffEdit-stable-diffusion/blob/main/diffedit.ipynb). +The implementation is mostly based on the DDIM inversion definition of [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://huggingface.co/papers/2211.09794) and notebook implementation of the [`DiffEdit`] latent inversion from [Xiang-cd/DiffEdit-stable-diffusion](https://github.com/Xiang-cd/DiffEdit-stable-diffusion/blob/main/diffedit.ipynb). ## Tips -Dynamic thresholding from Imagen (https://huggingface.co/papers/2205.11487) is supported, and for pixel-space +Dynamic thresholding from [Imagen](https://huggingface.co/papers/2205.11487) is supported, and for pixel-space diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic thresholding. This thresholding method is unsuitable for latent-space diffusion models such as Stable Diffusion. diff --git a/docs/source/en/api/schedulers/overview.md b/docs/source/en/api/schedulers/overview.md index 20981b7a2ad8..ef17e43e7217 100644 --- a/docs/source/en/api/schedulers/overview.md +++ b/docs/source/en/api/schedulers/overview.md @@ -61,4 +61,4 @@ The different schedulers in this class, depending on the ordinary differential e ## PushToHubMixin -[[autodoc]] utils.PushToHubMixin \ No newline at end of file +[[autodoc]] utils.PushToHubMixin diff --git a/docs/source/en/api/schedulers/pndm.md b/docs/source/en/api/schedulers/pndm.md index bf0e6661e4d1..33717662ae3f 100644 --- a/docs/source/en/api/schedulers/pndm.md +++ b/docs/source/en/api/schedulers/pndm.md @@ -18,4 +18,4 @@ specific language governing permissions and limitations under the License. 
[[autodoc]] PNDMScheduler ## SchedulerOutput -[[autodoc]] schedulers.scheduling_utils.SchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_utils.SchedulerOutput diff --git a/docs/source/en/api/schedulers/repaint.md b/docs/source/en/api/schedulers/repaint.md index e68b0021634b..b3910ad71056 100644 --- a/docs/source/en/api/schedulers/repaint.md +++ b/docs/source/en/api/schedulers/repaint.md @@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License. The abstract from the paper is: -*Free-form inpainting is the task of adding new content to an image in the regions specified by an arbitrary binary mask. Most existing approaches train for a certain distribution of masks, which limits their generalization capabilities to unseen mask types. Furthermore, training with pixel-wise and perceptual losses often leads to simple textural extensions towards the missing areas instead of semantically meaningful generation. In this work, we propose RePaint: A Denoising Diffusion Probabilistic Model (DDPM) based inpainting approach that is applicable to even extreme masks. We employ a pretrained unconditional DDPM as the generative prior. To condition the generation process, we only alter the reverse diffusion iterations by sampling the unmasked regions using the given image information. Since this technique does not modify or condition the original DDPM network itself, the model produces high-quality and diverse output images for any inpainting form. We validate our method for both faces and general-purpose image inpainting using standard and extreme masks. RePaint outperforms state-of-the-art Autoregressive, and GAN approaches for at least five out of six mask distributions. Github Repository: git.io/RePaint*. +*Free-form inpainting is the task of adding new content to an image in the regions specified by an arbitrary binary mask. Most existing approaches train for a certain distribution of masks, which limits their generalization capabilities to unseen mask types. Furthermore, training with pixel-wise and perceptual losses often leads to simple textural extensions towards the missing areas instead of semantically meaningful generation. In this work, we propose RePaint: A Denoising Diffusion Probabilistic Model (DDPM) based inpainting approach that is applicable to even extreme masks. We employ a pretrained unconditional DDPM as the generative prior. To condition the generation process, we only alter the reverse diffusion iterations by sampling the unmasked regions using the given image information. Since this technique does not modify or condition the original DDPM network itself, the model produces high-quality and diverse output images for any inpainting form. We validate our method for both faces and general-purpose image inpainting using standard and extreme masks. RePaint outperforms state-of-the-art Autoregressive, and GAN approaches for at least five out of six mask distributions. GitHub Repository: [this http URL](http://git.io/RePaint).* The original implementation can be found at [andreas128/RePaint](https://github.com/andreas128/). @@ -24,4 +24,4 @@ The original implementation can be found at [andreas128/RePaint](https://github. 
[[autodoc]] RePaintScheduler ## RePaintSchedulerOutput -[[autodoc]] schedulers.scheduling_repaint.RePaintSchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_repaint.RePaintSchedulerOutput diff --git a/docs/source/en/api/schedulers/score_sde_ve.md b/docs/source/en/api/schedulers/score_sde_ve.md index 84e077316dc0..5b930f192d93 100644 --- a/docs/source/en/api/schedulers/score_sde_ve.md +++ b/docs/source/en/api/schedulers/score_sde_ve.md @@ -16,10 +16,10 @@ specific language governing permissions and limitations under the License. The abstract from the paper is: -*Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (\aka, score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model*. +*Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (\aka, score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. 
In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model.* ## ScoreSdeVeScheduler [[autodoc]] ScoreSdeVeScheduler ## SdeVeOutput -[[autodoc]] schedulers.scheduling_sde_ve.SdeVeOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_sde_ve.SdeVeOutput diff --git a/docs/source/en/api/schedulers/score_sde_vp.md b/docs/source/en/api/schedulers/score_sde_vp.md index 0f70a424841a..204cba877722 100644 --- a/docs/source/en/api/schedulers/score_sde_vp.md +++ b/docs/source/en/api/schedulers/score_sde_vp.md @@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License. The abstract from the paper is: -*Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (\aka, score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model*. +*Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (\aka, score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. 
We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model.* diff --git a/docs/source/en/api/schedulers/singlestep_dpm_solver.md b/docs/source/en/api/schedulers/singlestep_dpm_solver.md index b5e1a317e1b1..8962a3e40d9a 100644 --- a/docs/source/en/api/schedulers/singlestep_dpm_solver.md +++ b/docs/source/en/api/schedulers/singlestep_dpm_solver.md @@ -23,7 +23,7 @@ The original implementation can be found at [LuChengTHU/dpm-solver](https://gith It is recommended to set `solver_order` to 2 for guide sampling, and `solver_order=3` for unconditional sampling. -Dynamic thresholding from Imagen (https://huggingface.co/papers/2205.11487) is supported, and for pixel-space +Dynamic thresholding from [Imagen](https://huggingface.co/papers/2205.11487) is supported, and for pixel-space diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use dynamic thresholding. This thresholding method is unsuitable for latent-space diffusion models such as Stable Diffusion. @@ -32,4 +32,4 @@ Stable Diffusion. [[autodoc]] DPMSolverSinglestepScheduler ## SchedulerOutput -[[autodoc]] schedulers.scheduling_utils.SchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_utils.SchedulerOutput diff --git a/docs/source/en/api/schedulers/stochastic_karras_ve.md b/docs/source/en/api/schedulers/stochastic_karras_ve.md index 4e37cce815b3..eb954d7e5e7b 100644 --- a/docs/source/en/api/schedulers/stochastic_karras_ve.md +++ b/docs/source/en/api/schedulers/stochastic_karras_ve.md @@ -12,10 +12,10 @@ specific language governing permissions and limitations under the License. # KarrasVeScheduler -`KarrasVeScheduler` is a stochastic sampler tailored o variance-expanding (VE) models. It is based on the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) and [Score-based generative modeling through stochastic differential equations](https://huggingface.co/papers/2011.13456) papers. +`KarrasVeScheduler` is a stochastic sampler tailored to variance-expanding (VE) models. It is based on the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) and [Score-based generative modeling through stochastic differential equations](https://huggingface.co/papers/2011.13456) papers. 
## KarrasVeScheduler [[autodoc]] KarrasVeScheduler ## KarrasVeOutput -[[autodoc]] schedulers.scheduling_karras_ve.KarrasVeOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_karras_ve.KarrasVeOutput diff --git a/docs/source/en/api/schedulers/unipc.md b/docs/source/en/api/schedulers/unipc.md index 56c6fd5bac0d..df514ca4a61c 100644 --- a/docs/source/en/api/schedulers/unipc.md +++ b/docs/source/en/api/schedulers/unipc.md @@ -19,19 +19,17 @@ UniPC is by design model-agnostic, supporting pixel-space/latent-space DPMs on u The abstract from the paper is: -*Diffusion probabilistic models (DPMs) have demonstrated a very promising ability in high-resolution image synthesis. However, sampling from a pre-trained DPM usually requires hundreds of model evaluations, which is computationally expensive. Despite recent progress in designing high-order solvers for DPMs, there still exists room for further speedup, especially in extremely few steps (e.g., 5~10 steps). Inspired by the predictor-corrector for ODE solvers, we develop a unified corrector (UniC) that can be applied after any existing DPM sampler to increase the order of accuracy without extra model evaluations, and derive a unified predictor (UniP) that supports arbitrary order as a byproduct. Combining UniP and UniC, we propose a unified predictor-corrector framework called UniPC for the fast sampling of DPMs, which has a unified analytical form for any order and can significantly improve the sampling quality over previous methods. We evaluate our methods through extensive experiments including both unconditional and conditional sampling using pixel-space and latent-space DPMs. Our UniPC can achieve 3.87 FID on CIFAR10 (unconditional) and 7.51 FID on ImageNet 256times256 (conditional) with only 10 function evaluations. Code is available at https://github.com/wl-zhao/UniPC*. - -The original codebase can be found at [wl-zhao/UniPC](https://github.com/wl-zhao/UniPC). +*Diffusion probabilistic models (DPMs) have demonstrated a very promising ability in high-resolution image synthesis. However, sampling from a pre-trained DPM is time-consuming due to the multiple evaluations of the denoising network, making it more and more important to accelerate the sampling of DPMs. Despite recent progress in designing fast samplers, existing methods still cannot generate satisfying images in many applications where fewer steps (e.g., <10) are favored. In this paper, we develop a unified corrector (UniC) that can be applied after any existing DPM sampler to increase the order of accuracy without extra model evaluations, and derive a unified predictor (UniP) that supports arbitrary order as a byproduct. Combining UniP and UniC, we propose a unified predictor-corrector framework called UniPC for the fast sampling of DPMs, which has a unified analytical form for any order and can significantly improve the sampling quality over previous methods, especially in extremely few steps. We evaluate our methods through extensive experiments including both unconditional and conditional sampling using pixel-space and latent-space DPMs. Our UniPC can achieve 3.87 FID on CIFAR10 (unconditional) and 7.51 FID on ImageNet 256×256 (conditional) with only 10 function evaluations. Code is available at [this https URL](https://github.com/wl-zhao/UniPC).* ## Tips It is recommended to set `solver_order` to 2 for guide sampling, and `solver_order=3` for unconditional sampling. 
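A rough sketch of the UniPC tip above, illustrative only and not part of the diff; the `predict_x0`/`thresholding` flags mentioned in the hunk that follows apply to pixel-space models only:

```python
from diffusers import UniPCMultistepScheduler

# Illustrative only: 2nd-order UniPC for guided sampling; use solver_order=3
# for unconditional sampling instead.
scheduler = UniPCMultistepScheduler(solver_order=2)

# Pixel-space variant with dynamic thresholding enabled:
# scheduler = UniPCMultistepScheduler(solver_order=2, predict_x0=True, thresholding=True)
```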
-Dynamic thresholding from Imagen (https://huggingface.co/papers/2205.11487) is supported, and for pixel-space +Dynamic thresholding from [Imagen](https://huggingface.co/papers/2205.11487) is supported, and for pixel-space diffusion models, you can set both `predict_x0=True` and `thresholding=True` to use dynamic thresholding. This thresholding method is unsuitable for latent-space diffusion models such as Stable Diffusion. ## UniPCMultistepScheduler [[autodoc]] UniPCMultistepScheduler ## SchedulerOutput -[[autodoc]] schedulers.scheduling_utils.SchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_utils.SchedulerOutput diff --git a/docs/source/en/api/schedulers/vq_diffusion.md b/docs/source/en/api/schedulers/vq_diffusion.md index 5d31a3e3c6ed..09928583f670 100644 --- a/docs/source/en/api/schedulers/vq_diffusion.md +++ b/docs/source/en/api/schedulers/vq_diffusion.md @@ -22,4 +22,4 @@ The abstract from the paper is: [[autodoc]] VQDiffusionScheduler ## VQDiffusionSchedulerOutput -[[autodoc]] schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput \ No newline at end of file +[[autodoc]] schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput diff --git a/docs/source/en/api/utilities.md b/docs/source/en/api/utilities.md index abc38416053a..77ada0834808 100644 --- a/docs/source/en/api/utilities.md +++ b/docs/source/en/api/utilities.md @@ -1,3 +1,15 @@ + + # Utilities Utility and helper functions for working with 🤗 Diffusers. @@ -24,4 +36,4 @@ Utility and helper functions for working with 🤗 Diffusers. ## make_image_grid -[[autodoc]] utils.pil_utils.make_image_grid +[[autodoc]] utils.make_image_grid From 5b231aa38b115c5f7aa4b1754692a9c66c01a5ce Mon Sep 17 00:00:00 2001 From: "Long(Tony) Lian" <1040424979@qq.com> Date: Mon, 13 Nov 2023 18:13:37 -0800 Subject: [PATCH 12/13] Fix the pipeline name in the examples for LMD+ pipeline. Add a colab link to pipeline README. 
(#5775) * Fix the pipeline name in the examples for LMD+ pipeline * Add LMD+ colab link * Apply code formatting --------- Co-authored-by: Sayak Paul --- examples/community/README.md | 6 +++--- examples/community/llm_grounded_diffusion.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index b9b4f69aff06..3858f25d39ca 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -8,7 +8,7 @@ If a community doesn't work as expected, please open an issue and ping the autho | Example | Description | Code Example | Colab | Author | |:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:| -| LLM-grounded Diffusion (LMD+) | LMD greatly improves the prompt following ability of text-to-image generation models by introducing an LLM as a front-end prompt parser and layout planner. [Project page.](https://llm-grounded-diffusion.github.io/) [See our full codebase (also with diffusers).](https://github.com/TonyLianLong/LLM-groundedDiffusion) | [LLM-grounded Diffusion (LMD+)](#llm-grounded-diffusion) | [Huggingface Demo](https://huggingface.co/spaces/longlian/llm-grounded-diffusion) | [Long (Tony) Lian](https://tonylian.com/) | +| LLM-grounded Diffusion (LMD+) | LMD greatly improves the prompt following ability of text-to-image generation models by introducing an LLM as a front-end prompt parser and layout planner. 
[Project page.](https://llm-grounded-diffusion.github.io/) [See our full codebase (also with diffusers).](https://github.com/TonyLianLong/LLM-groundedDiffusion) | [LLM-grounded Diffusion (LMD+)](#llm-grounded-diffusion) | [Huggingface Demo](https://huggingface.co/spaces/longlian/llm-grounded-diffusion) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SXzMSeAB-LJYISb2yrUOdypLz4OYWUKj) | [Long (Tony) Lian](https://tonylian.com/) | | CLIP Guided Stable Diffusion | Doing CLIP guidance for text to image generation with Stable Diffusion | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) | [Suraj Patil](https://github.com/patil-suraj/) | | One Step U-Net (Dummy) | Example showcasing of how to use Community Pipelines (see https://github.com/huggingface/diffusers/issues/841) | [One Step U-Net](#one-step-unet) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) | | Stable Diffusion Interpolation | Interpolate the latent space of Stable Diffusion between different prompts/seeds | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | - | [Nate Raw](https://github.com/nateraw/) | @@ -74,7 +74,7 @@ from diffusers import DiffusionPipeline pipe = DiffusionPipeline.from_pretrained( "longlian/lmd_plus", - custom_pipeline="llm-grounded-diffusion", + custom_pipeline="llm_grounded_diffusion", variant="fp16", torch_dtype=torch.float16 ) pipe.enable_model_cpu_offload() @@ -108,7 +108,7 @@ from diffusers import DiffusionPipeline pipe = DiffusionPipeline.from_pretrained( "longlian/lmd_plus", - custom_pipeline="llm-grounded-diffusion", + custom_pipeline="llm_grounded_diffusion", variant="fp16", torch_dtype=torch.float16 ) pipe.enable_model_cpu_offload() diff --git a/examples/community/llm_grounded_diffusion.py b/examples/community/llm_grounded_diffusion.py index e767d6068dc2..d47c99bb2990 100644 --- a/examples/community/llm_grounded_diffusion.py +++ b/examples/community/llm_grounded_diffusion.py @@ -43,7 +43,7 @@ >>> pipe = DiffusionPipeline.from_pretrained( ... "longlian/lmd_plus", - ... custom_pipeline="llm-grounded-diffusion", + ... custom_pipeline="llm_grounded_diffusion", ... variant="fp16", torch_dtype=torch.float16 ... ) >>> pipe.enable_model_cpu_offload() From ed759f0aee721f8520c5bf94d4b7bd7c0ae3dcbb Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 14 Nov 2023 08:34:59 +0530 Subject: [PATCH 13/13] [PixArt-Alpha] Introduce resolution binning (#5739) * feat: add resolution binning Co-authored-by: lawrence-cj * rename * debug * add :test * remove unused variable * set resolution_binning to False. 
--------- Co-authored-by: lawrence-cj --- .../pixart_alpha/pipeline_pixart_alpha.py | 81 ++++++++++++++++++- tests/pipelines/pixart/test_pixart.py | 9 ++- 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py index 147e2b76e6c6..c3f667ba16be 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -19,6 +19,7 @@ from typing import Callable, List, Optional, Tuple, Union import torch +import torch.nn.functional as F from transformers import T5EncoderModel, T5Tokenizer from ...image_processor import VaeImageProcessor @@ -43,7 +44,6 @@ if is_ftfy_available(): import ftfy - EXAMPLE_DOC_STRING = """ Examples: ```py @@ -60,6 +60,42 @@ ``` """ +ASPECT_RATIO_1024_BIN = { + "0.25": [512.0, 2048.0], + "0.28": [512.0, 1856.0], + "0.32": [576.0, 1792.0], + "0.33": [576.0, 1728.0], + "0.35": [576.0, 1664.0], + "0.4": [640.0, 1600.0], + "0.42": [640.0, 1536.0], + "0.48": [704.0, 1472.0], + "0.5": [704.0, 1408.0], + "0.52": [704.0, 1344.0], + "0.57": [768.0, 1344.0], + "0.6": [768.0, 1280.0], + "0.68": [832.0, 1216.0], + "0.72": [832.0, 1152.0], + "0.78": [896.0, 1152.0], + "0.82": [896.0, 1088.0], + "0.88": [960.0, 1088.0], + "0.94": [960.0, 1024.0], + "1.0": [1024.0, 1024.0], + "1.07": [1024.0, 960.0], + "1.13": [1088.0, 960.0], + "1.21": [1088.0, 896.0], + "1.29": [1152.0, 896.0], + "1.38": [1152.0, 832.0], + "1.46": [1216.0, 832.0], + "1.67": [1280.0, 768.0], + "1.75": [1344.0, 768.0], + "2.0": [1408.0, 704.0], + "2.09": [1472.0, 704.0], + "2.4": [1536.0, 640.0], + "2.5": [1600.0, 640.0], + "3.0": [1728.0, 576.0], + "4.0": [2048.0, 512.0], +} + class PixArtAlphaPipeline(DiffusionPipeline): r""" @@ -495,6 +531,38 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents + @staticmethod + def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]: + """Returns binned height and width.""" + ar = float(height / width) + closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar)) + default_hw = ratios[closest_ratio] + return int(default_hw[0]), int(default_hw[1]) + + @staticmethod + def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int) -> torch.Tensor: + orig_height, orig_width = samples.shape[2], samples.shape[3] + + # Check if resizing is needed + if orig_height != new_height or orig_width != new_width: + ratio = max(new_height / orig_height, new_width / orig_width) + resized_width = int(orig_width * ratio) + resized_height = int(orig_height * ratio) + + # Resize + samples = F.interpolate( + samples, size=(resized_height, resized_width), mode="bilinear", align_corners=False + ) + + # Center Crop + start_x = (resized_width - new_width) // 2 + end_x = start_x + new_width + start_y = (resized_height - new_height) // 2 + end_y = start_y + new_height + samples = samples[:, :, start_y:end_y, start_x:end_x] + + return samples + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -518,6 +586,7 @@ def __call__( callback_steps: int = 1, clean_caption: bool = True, mask_feature: bool = True, + use_resolution_binning: bool = True, ) -> Union[ImagePipelineOutput, Tuple]: """ Function invoked when calling the pipeline for generation. @@ -580,6 +649,10 @@ def __call__( be installed. 
If the dependencies are not installed, the embeddings will be created from the raw prompt. mask_feature (`bool` defaults to `True`): If set to `True`, the text embeddings will be masked. + use_resolution_binning: + (`bool` defaults to `True`): If set to `True`, the requested height and width are first mapped to the + closest resolutions using `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, + they are resized back to the requested resolution. Useful for generating non-square images. Examples: @@ -591,6 +664,10 @@ def __call__( # 1. Check inputs. Raise error if not correct height = height or self.transformer.config.sample_size * self.vae_scale_factor width = width or self.transformer.config.sample_size * self.vae_scale_factor + if use_resolution_binning: + orig_height, orig_width = height, width + height, width = self.classify_height_width_bin(height, width, ratios=ASPECT_RATIO_1024_BIN) + self.check_inputs( prompt, height, width, negative_prompt, callback_steps, prompt_embeds, negative_prompt_embeds ) @@ -709,6 +786,8 @@ def __call__( if not output_type == "latent": image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + if use_resolution_binning: + image = self.resize_and_crop_tensor(image, orig_width, orig_height) else: image = latents diff --git a/tests/pipelines/pixart/test_pixart.py b/tests/pipelines/pixart/test_pixart.py index a04f4e1a8804..1fb2560b29b6 100644 --- a/tests/pipelines/pixart/test_pixart.py +++ b/tests/pipelines/pixart/test_pixart.py @@ -89,7 +89,8 @@ def get_dummy_inputs(self, device, seed=0): "generator": generator, "num_inference_steps": 2, "guidance_scale": 5.0, - "output_type": "numpy", + "use_resolution_binning": False, + "output_type": "np", } return inputs @@ -120,6 +121,7 @@ def test_save_load_optional_components(self): "generator": generator, "num_inference_steps": num_inference_steps, "output_type": output_type, + "use_resolution_binning": False, } # set all optional components to None @@ -154,6 +156,7 @@ def test_save_load_optional_components(self): "generator": generator, "num_inference_steps": num_inference_steps, "output_type": output_type, + "use_resolution_binning": False, } output_loaded = pipe_loaded(**inputs)[0] @@ -189,8 +192,8 @@ def test_inference_non_square_images(self): inputs = self.get_dummy_inputs(device) image = pipe(**inputs, height=32, width=48).images image_slice = image[0, -3:, -3:, -1] - self.assertEqual(image.shape, (1, 32, 48, 3)) + expected_slice = np.array([0.3859, 0.2987, 0.2333, 0.5243, 0.6721, 0.4436, 0.5292, 0.5373, 0.4416]) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 1e-3) @@ -219,6 +222,7 @@ def test_inference_with_embeddings_and_multiple_images(self): "num_inference_steps": num_inference_steps, "output_type": output_type, "num_images_per_prompt": 2, + "use_resolution_binning": False, } # set all optional components to None @@ -254,6 +258,7 @@ def test_inference_with_embeddings_and_multiple_images(self): "num_inference_steps": num_inference_steps, "output_type": output_type, "num_images_per_prompt": 2, + "use_resolution_binning": False, } output_loaded = pipe_loaded(**inputs)[0]
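For reference, a minimal usage sketch of the `use_resolution_binning` flag introduced by this patch; it is illustrative only, and the checkpoint name `PixArt-alpha/PixArt-XL-2-1024-MS` is an assumption rather than something taken from the diff:

```python
import torch
from diffusers import PixArtAlphaPipeline

# Checkpoint name is assumed here; substitute the PixArt-Alpha weights you actually use.
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16
).to("cuda")

# With use_resolution_binning=True (the new default), the requested 720x1280 size is
# first snapped to the closest ASPECT_RATIO_1024_BIN entry (768x1344) for denoising,
# and the decoded image is then resized and center-cropped back to 720x1280.
image = pipe(
    "a red panda reading a book",
    height=720,
    width=1280,
    use_resolution_binning=True,
).images[0]
```

Passing `use_resolution_binning=False` skips the binning and denoises at the requested resolution directly, which is what the updated tests above do.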