From 08b453e3828f80027d881bb460716af95e192bcd Mon Sep 17 00:00:00 2001 From: Charchit Sharma Date: Sat, 9 Dec 2023 11:02:55 +0530 Subject: [PATCH 1/4] IP-Adapter for StableDiffusionControlNetImg2ImgPipeline (#5901) * adapter for StableDiffusionControlNetImg2ImgPipeline * fix-copies * fix-copies --------- Co-authored-by: Sayak Paul --- .../controlnet/pipeline_controlnet_img2img.py | 50 ++++++++++++++++--- .../controlnet/test_controlnet_img2img.py | 2 + 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index fa489941c987..037641bd820e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -19,10 +19,10 @@ import PIL.Image import torch import torch.nn.functional as F -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -130,7 +130,7 @@ def prepare_image(image): class StableDiffusionControlNetImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin ): r""" Pipeline for image-to-image generation using Stable Diffusion with ControlNet guidance. @@ -140,7 +140,7 @@ class StableDiffusionControlNetImg2ImgPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. @@ -166,7 +166,7 @@ class StableDiffusionControlNetImg2ImgPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] @@ -180,6 +180,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, ): super().__init__() @@ -212,6 +213,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) @@ -468,6 +470,31 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -861,6 +888,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -922,6 +950,7 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -1053,6 +1082,11 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 4. Prepare image image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) @@ -1111,7 +1145,10 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7.1 Create tensor stating which controlnets to keep + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # 7.2 Create tensor stating which controlnets to keep controlnet_keep = [] for i in range(len(timesteps)): keeps = [ @@ -1171,6 +1208,7 @@ def __call__( cross_attention_kwargs=self.cross_attention_kwargs, down_block_additional_residuals=down_block_res_samples, mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index 5a7f70eb488a..b4b67e6476f6 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -134,6 +134,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components @@ -273,6 +274,7 @@ def init_weights(m): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components From 88bdd97ccd1f823510b9ed4e7f0ce5a44a0d2136 Mon Sep 17 00:00:00 2001 From: Aryan V S Date: Sun, 10 Dec 2023 21:19:14 +0530 Subject: [PATCH 2/4] IP adapter support for most pipelines (#5900) * support ip-adapter in src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py * support ip-adapter in src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py * support ip-adapter in src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py * update tests * support ip-adapter in src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py * support ip-adapter in src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py * support ip-adapter in src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py * support ip-adapter in src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py * support ip-adapter in src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py * support ip-adapter in src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py * revert changes to sd_attend_and_excite and sd_upscale * make style * fix broken tests * update ip-adapter implementation to latest * apply suggestions from review --------- Co-authored-by: YiYi Xu Co-authored-by: Sayak Paul --- .../pipeline_latent_consistency_img2img.py | 51 +++++++++++-- .../pipeline_latent_consistency_text2img.py | 53 ++++++++++++-- ...eline_stable_diffusion_instruct_pix2pix.py | 64 ++++++++++++++-- .../pipeline_stable_diffusion_ldm3d.py | 55 ++++++++++++-- .../pipeline_stable_diffusion_panorama.py | 60 +++++++++++++-- .../pipeline_stable_diffusion_sag.py | 73 +++++++++++++++++-- .../pipeline_stable_diffusion_safe.py | 61 ++++++++++++++-- .../test_latent_consistency_models.py | 1 + .../test_latent_consistency_models_img2img.py | 1 + ...st_stable_diffusion_instruction_pix2pix.py | 1 + .../test_stable_diffusion_ldm3d.py | 1 + .../test_stable_diffusion_panorama.py | 1 + .../test_stable_diffusion_sag.py | 1 + 13 files changed, 380 insertions(+), 43 deletions(-) diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index ed29a939388f..63a54f5aa666 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -20,11 +20,11 @@ import PIL.Image import torch -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import LCMScheduler from ...utils import ( @@ -129,7 +129,7 @@ def retrieve_timesteps( class LatentConsistencyModelImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for image-to-image generation using a latent consistency model. @@ -142,6 +142,7 @@ class LatentConsistencyModelImg2ImgPipeline( - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -166,7 +167,7 @@ class LatentConsistencyModelImg2ImgPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"] @@ -179,6 +180,7 @@ def __init__( scheduler: LCMScheduler, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, requires_safety_checker: bool = True, ): super().__init__() @@ -191,6 +193,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) if safety_checker is None and requires_safety_checker: @@ -449,6 +452,31 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -647,6 +675,7 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -695,6 +724,8 @@ def __call__( prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -758,6 +789,12 @@ def __call__( device = self._execution_device # do_classifier_free_guidance = guidance_scale > 1.0 + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + # 3. Encode input prompt lora_scale = ( self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None @@ -815,6 +852,9 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None) + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 8. LCM Multistep Sampling Loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order self._num_timesteps = len(timesteps) @@ -829,6 +869,7 @@ def __call__( timestep_cond=w_embedding, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index c8f1d647c15b..54d5a2ec989d 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -19,11 +19,11 @@ from typing import Any, Callable, Dict, List, Optional, Union import torch -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection -from ...image_processor import VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import LCMScheduler from ...utils import ( @@ -107,7 +107,7 @@ def retrieve_timesteps( class LatentConsistencyModelPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image generation using a latent consistency model. @@ -120,6 +120,7 @@ class LatentConsistencyModelPipeline( - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -144,7 +145,7 @@ class LatentConsistencyModelPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"] @@ -157,6 +158,7 @@ def __init__( scheduler: LCMScheduler, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, requires_safety_checker: bool = True, ): super().__init__() @@ -185,6 +187,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -433,6 +436,31 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -581,6 +609,7 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -629,6 +658,8 @@ def __call__( prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -697,6 +728,12 @@ def __call__( device = self._execution_device # do_classifier_free_guidance = guidance_scale > 1.0 + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + # 3. Encode input prompt lora_scale = ( self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None @@ -748,6 +785,9 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None) + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 8. LCM MultiStep Sampling Loop: num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order self._num_timesteps = len(timesteps) @@ -762,6 +802,7 @@ def __call__( timestep_cond=w_embedding, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index d922803858b0..b0021c5a3e63 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -18,11 +18,11 @@ import numpy as np import PIL.Image import torch -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, logging from ...utils.torch_utils import randn_tensor @@ -72,7 +72,9 @@ def retrieve_latents( raise AttributeError("Could not access latents of provided encoder_output") -class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionInstructPix2PixPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin +): r""" Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion). @@ -83,6 +85,7 @@ class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversion - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -105,7 +108,7 @@ class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversion """ model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] _callback_tensor_inputs = ["latents", "prompt_embeds", "image_latents"] @@ -118,6 +121,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, requires_safety_checker: bool = True, ): super().__init__() @@ -146,6 +150,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -166,6 +171,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -213,6 +219,8 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -293,6 +301,16 @@ def __call__( self._guidance_scale = guidance_scale self._image_guidance_scale = image_guidance_scale + device = self._execution_device + + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([image_embeds, negative_image_embeds, negative_image_embeds]) + if image is None: raise ValueError("`image` input cannot be undefined.") @@ -367,6 +385,9 @@ def __call__( # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 8.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 9. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order self._num_timesteps = len(timesteps) @@ -383,7 +404,11 @@ def __call__( # predict the noise residual noise_pred = self.unet( - scaled_latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False + scaled_latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, )[0] # Hack: @@ -598,11 +623,36 @@ def _encode_prompt( # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds] + # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds] prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]) return prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py index f410c08a3bbe..ee9335a2bb01 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py @@ -19,11 +19,11 @@ import numpy as np import PIL.Image import torch -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection -from ...image_processor import VaeImageProcessorLDM3D -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel +from ...image_processor import PipelineImageInput, VaeImageProcessorLDM3D +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -82,7 +82,7 @@ class LDM3DPipelineOutput(BaseOutput): class StableDiffusionLDM3DPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image and 3D generation using LDM3D. @@ -95,6 +95,7 @@ class StableDiffusionLDM3DPipeline( - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -117,7 +118,7 @@ class StableDiffusionLDM3DPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] def __init__( @@ -129,6 +130,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection], requires_safety_checker: bool = True, ): super().__init__() @@ -157,6 +159,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor) @@ -410,6 +413,31 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: has_nsfw_concept = None @@ -529,6 +557,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -573,6 +602,8 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -622,6 +653,14 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + if do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 3. Encode input prompt prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, @@ -659,6 +698,9 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 7. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -673,6 +715,7 @@ def __call__( t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index ff6a66ab57c9..bcc063499459 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -16,11 +16,11 @@ from typing import Any, Callable, Dict, List, Optional, Union import torch -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection -from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import DDIMScheduler from ...utils import ( @@ -59,13 +59,19 @@ """ -class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin): r""" Pipeline for text-to-image generation using MultiDiffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. @@ -87,7 +93,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM """ model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] def __init__( @@ -99,6 +105,7 @@ def __init__( scheduler: DDIMScheduler, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, requires_safety_checker: bool = True, ): super().__init__() @@ -127,6 +134,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -363,6 +371,31 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -529,6 +562,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -578,6 +612,8 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -632,6 +668,14 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + if do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 3. Encode input prompt text_encoder_lora_scale = ( cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None @@ -681,6 +725,9 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 8. Denoising loop # Each denoising step also includes refinement of the latents with respect to the # views. @@ -743,6 +790,7 @@ def __call__( t, encoder_hidden_states=prompt_embeds_input, cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, ).sample # perform guidance diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index 68652e977c5d..792a3c40b33d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -17,11 +17,11 @@ import torch import torch.nn.functional as F -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection -from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -98,13 +98,17 @@ def __call__( # Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input -class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. @@ -126,7 +130,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin) """ model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] def __init__( @@ -138,6 +142,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, requires_safety_checker: bool = True, ): super().__init__() @@ -150,6 +155,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -386,6 +392,31 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -519,6 +550,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -565,6 +597,8 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -618,6 +652,14 @@ def __call__( # `sag_scale = 0` means no self-attention guidance do_self_attention_guidance = sag_scale > 0.0 + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + if do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 3. Encode input prompt prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, @@ -655,6 +697,10 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + added_uncond_kwargs = {"image_embeds": negative_image_embeds} if ip_adapter_image is not None else None + # 7. Denoising loop store_processor = CrossAttnStoreProcessor() self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor @@ -680,6 +726,7 @@ def get_map_size(module, input, output): t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, ).sample # perform guidance @@ -703,7 +750,12 @@ def get_map_size(module, input, output): ) uncond_emb, _ = prompt_embeds.chunk(2) # forward and give guidance - degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=uncond_emb).sample + degraded_pred = self.unet( + degraded_latents, + t, + encoder_hidden_states=uncond_emb, + added_cond_kwargs=added_uncond_kwargs, + ).sample noise_pred += sag_scale * (noise_pred_uncond - degraded_pred) else: # DDIM-like prediction of x0 @@ -715,7 +767,12 @@ def get_map_size(module, input, output): pred_x0, cond_attn, map_size, t, self.pred_epsilon(latents, noise_pred, t) ) # forward and give guidance - degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=prompt_embeds).sample + degraded_pred = self.unet( + degraded_latents, + t, + encoder_hidden_states=prompt_embeds, + added_cond_kwargs=added_cond_kwargs, + ).sample noise_pred += sag_scale * (noise_pred - degraded_pred) # compute the previous noisy sample x_t -> x_t-1 diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index eb24cbfd947b..fdc7844a7e08 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -5,10 +5,12 @@ import numpy as np import torch from packaging import version -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...configuration_utils import FrozenDict -from ...models import AutoencoderKL, UNet2DConditionModel +from ...image_processor import PipelineImageInput +from ...loaders import IPAdapterMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, logging from ...utils.torch_utils import randn_tensor @@ -20,13 +22,16 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class StableDiffusionPipelineSafe(DiffusionPipeline): +class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin): r""" Pipeline based on the [`StableDiffusionPipeline`] for text-to-image generation using Safe Latent Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. @@ -48,7 +53,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline): """ model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] def __init__( self, @@ -59,6 +64,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: SafeStableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, requires_safety_checker: bool = True, ): super().__init__() @@ -140,6 +146,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self._safety_text_concept = safety_concept self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) @@ -467,6 +474,31 @@ def perform_safety_guidance( noise_guidance = noise_guidance - noise_guidance_safety return noise_guidance, safety_momentum + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + @torch.no_grad() def __call__( self, @@ -480,6 +512,7 @@ def __call__( eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -521,6 +554,8 @@ def __call__( Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -588,6 +623,17 @@ def __call__( if not enable_safety_guidance: warnings.warn("Safety checker disabled!") + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + if do_classifier_free_guidance: + if enable_safety_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds, image_embeds]) + else: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 3. Encode input prompt prompt_embeds = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, enable_safety_guidance @@ -613,6 +659,9 @@ def __call__( # 6. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + safety_momentum = None num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order @@ -627,7 +676,9 @@ def __call__( latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet( + latent_model_input, t, encoder_hidden_states=prompt_embeds, added_cond_kwargs=added_cond_kwargs + ).sample # perform guidance if do_classifier_free_guidance: diff --git a/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py b/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py index 174d9b6de9f8..5d33b45c0973 100644 --- a/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py +++ b/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py @@ -87,6 +87,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, "requires_safety_checker": False, } return components diff --git a/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py b/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py index f9410ffe640a..5b4e2b191f53 100644 --- a/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py +++ b/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py @@ -97,6 +97,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, "requires_safety_checker": False, } return components diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index 69b36cb3bb8a..461bd14a43d8 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -108,6 +108,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py index e7e98c52d92c..cfdb0c57bbdb 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py @@ -93,6 +93,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index 657608df8b98..ad86a53f5ee9 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -91,6 +91,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py index 6eae1ce4d371..8b789408f5cb 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py @@ -93,6 +93,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components From 664e931bcbddd47c21c3d4d547221506f588b9e9 Mon Sep 17 00:00:00 2001 From: Edward Li Date: Mon, 11 Dec 2023 03:52:04 -0600 Subject: [PATCH 3/4] Correct type annotation for `VaeImageProcessor.numpy_to_pil` (#6111) From `(np.ndarray) -> PIL.Image.Image` to `(np.ndarray) -> List[PIL.Image.Image]`. --- src/diffusers/image_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index a515805fd087..ab96384fe9f1 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -88,7 +88,7 @@ def __init__( self.config.do_convert_rgb = False @staticmethod - def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image: + def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]: """ Convert a numpy image or a batch of images to a PIL image. """ From 0a401b95b7f298b3d029576e1d65d99f06ed1228 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Mon, 11 Dec 2023 21:55:28 +0300 Subject: [PATCH 4/4] [`Docs`] Fix typos (#6122) Fix typos and trim trailing whitespaces --- README.md | 4 +- examples/README.md | 10 +- examples/community/README.md | 176 +++++++++--------- examples/research_projects/README.md | 4 +- .../README.md | 10 +- .../multi_token_clip.py | 0 .../requirements.txt | 0 .../requirements_flax.txt | 0 .../textual_inversion.py | 0 .../textual_inversion_flax.py | 0 .../research_projects/onnxruntime/README.md | 2 +- .../onnxruntime/text_to_image/README.md | 4 +- .../onnxruntime/textual_inversion/README.md | 8 +- examples/research_projects/realfill/README.md | 2 +- .../research_projects/sdxl_flax/README.md | 18 +- examples/t2i_adapter/README_sdxl.md | 4 +- examples/text_to_image/README.md | 30 +-- examples/textual_inversion/README.md | 12 +- .../unconditional_image_generation/README.md | 6 +- examples/wuerstchen/text_to_image/README.md | 2 +- src/diffusers/pipelines/README.md | 52 +++--- ...inous_encoder.py => continuous_encoder.py} | 0 .../pipeline_spectrogram_diffusion.py | 2 +- .../pipelines/stable_diffusion/README.md | 22 +-- 24 files changed, 183 insertions(+), 185 deletions(-) rename examples/research_projects/{mulit_token_textual_inversion => multi_token_textual_inversion}/README.md (97%) rename examples/research_projects/{mulit_token_textual_inversion => multi_token_textual_inversion}/multi_token_clip.py (100%) rename examples/research_projects/{mulit_token_textual_inversion => multi_token_textual_inversion}/requirements.txt (100%) rename examples/research_projects/{mulit_token_textual_inversion => multi_token_textual_inversion}/requirements_flax.txt (100%) rename examples/research_projects/{mulit_token_textual_inversion => multi_token_textual_inversion}/textual_inversion.py (100%) rename examples/research_projects/{mulit_token_textual_inversion => multi_token_textual_inversion}/textual_inversion_flax.py (100%) rename src/diffusers/pipelines/spectrogram_diffusion/{continous_encoder.py => continuous_encoder.py} (100%) diff --git a/README.md b/README.md index 489e0d154af2..b52813d7e1a7 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi ## Quickstart -Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 15000+ checkpoints): +Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 16000+ checkpoints): ```python from diffusers import DiffusionPipeline @@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel **Please have a look at https://github.com/nateraw/stable-diffusion-videos for more in-detail information on how to create videos using stable diffusion as well as more feature-complete functionality.** @@ -310,7 +310,7 @@ import torch pipe = DiffusionPipeline.from_pretrained( 'hakurei/waifu-diffusion', custom_pipeline="lpw_stable_diffusion", - + torch_dtype=torch.float16 ) pipe=pipe.to("cuda") @@ -377,7 +377,7 @@ diffuser_pipeline = DiffusionPipeline.from_pretrained( custom_pipeline="speech_to_image_diffusion", speech_model=model, speech_processor=processor, - + torch_dtype=torch.float16, ) @@ -435,7 +435,7 @@ import torch pipe = DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", custom_pipeline="wildcard_stable_diffusion", - + torch_dtype=torch.float16, ) prompt = "__animal__ sitting on a __object__ wearing a __clothing__" @@ -449,7 +449,7 @@ out = pipe( ) ``` -### Composable Stable diffusion +### Composable Stable diffusion [Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) proposes conjunction and negation (negative prompts) operators for compositional generation with conditional diffusion models. @@ -499,7 +499,7 @@ tvu.save_image(grid, f'{prompt}_{args.weights}' + '.png') ``` ### Imagic Stable Diffusion -Allows you to edit an image using stable diffusion. +Allows you to edit an image using stable diffusion. ```python import requests @@ -539,7 +539,7 @@ image = res.images[0] image.save('./imagic/imagic_image_alpha_2.png') ``` -### Seed Resizing +### Seed Resizing Test seed resizing. Originally generate an image in 512 by 512, then generate image with same seed at 512 by 592 using seed resizing. Finally, generate 512 by 592 using original stable diffusion pipeline. ```python @@ -667,14 +667,14 @@ diffuser_pipeline = DiffusionPipeline.from_pretrained( detection_pipeline=language_detection_pipeline, translation_model=trans_model, translation_tokenizer=trans_tokenizer, - + torch_dtype=torch.float16, ) diffuser_pipeline.enable_attention_slicing() diffuser_pipeline = diffuser_pipeline.to(device) -prompt = ["a photograph of an astronaut riding a horse", +prompt = ["a photograph of an astronaut riding a horse", "Una casa en la playa", "Ein Hund, der Orange isst", "Un restaurant parisien"] @@ -715,7 +715,7 @@ mask_image = PIL.Image.open(mask_path).convert("RGB").resize((512, 512)) pipe = DiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-inpainting", custom_pipeline="img2img_inpainting", - + torch_dtype=torch.float16 ) pipe = pipe.to("cuda") @@ -758,8 +758,8 @@ prompt = "a cup" # the masked out region will be replaced with this image = pipe(image=image, text=text, prompt=prompt).images[0] ``` -### Bit Diffusion -Based https://arxiv.org/abs/2208.04202, this is used for diffusion on discrete data - eg, discreate image data, DNA sequence data. An unconditional discreate image can be generated like this: +### Bit Diffusion +Based https://arxiv.org/abs/2208.04202, this is used for diffusion on discrete data - eg, discreate image data, DNA sequence data. An unconditional discreate image can be generated like this: ```python from diffusers import DiffusionPipeline @@ -837,8 +837,8 @@ Usage:- ```python from diffusers import DiffusionPipeline -#Return a CheckpointMergerPipeline class that allows you to merge checkpoints. -#The checkpoint passed here is ignored. But still pass one of the checkpoints you plan to +#Return a CheckpointMergerPipeline class that allows you to merge checkpoints. +#The checkpoint passed here is ignored. But still pass one of the checkpoints you plan to #merge for convenience pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="checkpoint_merger") @@ -861,16 +861,16 @@ image = merged_pipe(prompt).images[0] ``` Some examples along with the merge details: -1. "CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" ; Sigmoid interpolation; alpha = 0.8 +1. "CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" ; Sigmoid interpolation; alpha = 0.8 ![Stable plus Waifu Sigmoid 0.8](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/stability_v1_4_waifu_sig_0.8.png) -2. "hakurei/waifu-diffusion" + "prompthero/openjourney" ; Inverse Sigmoid interpolation; alpha = 0.8 +2. "hakurei/waifu-diffusion" + "prompthero/openjourney" ; Inverse Sigmoid interpolation; alpha = 0.8 ![Stable plus Waifu Sigmoid 0.8](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/waifu_openjourney_inv_sig_0.8.png) -3. "CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" + "prompthero/openjourney"; Add Difference interpolation; alpha = 0.5 +3. "CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" + "prompthero/openjourney"; Add Difference interpolation; alpha = 0.5 ![Stable plus Waifu plus openjourney add_diff 0.5](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/stable_waifu_openjourney_add_diff_0.5.png) @@ -937,8 +937,8 @@ pipe = DiffusionPipeline.from_pretrained( img = Image.open('phone.jpg') mix_img = pipe( - img, - prompt = 'bed', + img, + prompt = 'bed', kmin = 0.3, kmax = 0.5, mix_factor = 0.5, @@ -1049,7 +1049,7 @@ print(pipeline.prior_scheduler) ### UnCLIP Text Interpolation Pipeline -This Diffusion Pipeline takes two prompts and interpolates between the two input prompts using spherical interpolation ( slerp ). The input prompts are converted to text embeddings by the pipeline's text_encoder and the interpolation is done on the resulting text_embeddings over the number of steps specified. Defaults to 5 steps. +This Diffusion Pipeline takes two prompts and interpolates between the two input prompts using spherical interpolation ( slerp ). The input prompts are converted to text embeddings by the pipeline's text_encoder and the interpolation is done on the resulting text_embeddings over the number of steps specified. Defaults to 5 steps. ```python import torch @@ -1086,7 +1086,7 @@ The resulting images in order:- ### UnCLIP Image Interpolation Pipeline -This Diffusion Pipeline takes two images or an image_embeddings tensor of size 2 and interpolates between their embeddings using spherical interpolation ( slerp ). The input images/image_embeddings are converted to image embeddings by the pipeline's image_encoder and the interpolation is done on the resulting image_embeddings over the number of steps specified. Defaults to 5 steps. +This Diffusion Pipeline takes two images or an image_embeddings tensor of size 2 and interpolates between their embeddings using spherical interpolation ( slerp ). The input images/image_embeddings are converted to image embeddings by the pipeline's image_encoder and the interpolation is done on the resulting image_embeddings over the number of steps specified. Defaults to 5 steps. ```python import torch @@ -1127,8 +1127,8 @@ The resulting images in order:- ![result5](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_to_flowers_5.png) ### DDIM Noise Comparative Analysis Pipeline -#### **Research question: What visual concepts do the diffusion models learn from each noise level during training?** -The [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227) paper proposed an approach to answer the above question, which is their second contribution. +#### **Research question: What visual concepts do the diffusion models learn from each noise level during training?** +The [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227) paper proposed an approach to answer the above question, which is their second contribution. The approach consists of the following steps: 1. The input is an image x0. @@ -1170,7 +1170,7 @@ Here is the result of this pipeline (which is DDIM) on CelebA-HQ dataset. ### CLIP Guided Img2Img Stable Diffusion -CLIP guided Img2Img stable diffusion can help to generate more realistic images with an initial image +CLIP guided Img2Img stable diffusion can help to generate more realistic images with an initial image by guiding stable diffusion at every denoising step with an additional CLIP model. The following code requires roughly 12GB of GPU RAM. @@ -1322,8 +1322,8 @@ target_prompt = "A golden retriever" # run the pipeline result_image = pipeline( - base_prompt=base_prompt, - target_prompt=target_prompt, + base_prompt=base_prompt, + target_prompt=target_prompt, image=cropped_image, ) @@ -1537,7 +1537,7 @@ python -m pip install intel_extension_for_pytorch== -f https://dev pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex") # For Float32 pipe.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512) #value of image height/width should be consistent with the pipeline inference -# For BFloat16 +# For BFloat16 pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512) #value of image height/width should be consistent with the pipeline inference ``` @@ -1545,7 +1545,7 @@ Then you can use the ipex pipeline in a similar way to the default stable diffus ```python # For Float32 image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()' -# For BFloat16 +# For BFloat16 with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16): image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()' ``` @@ -1604,21 +1604,21 @@ latency = elapsed_time(pipe4) print("Latency of StableDiffusionPipeline--fp32",latency) ``` - + ### CLIP Guided Images Mixing With Stable Diffusion ![clip_guided_images_mixing_examples](https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/main.png) -CLIP guided stable diffusion images mixing pipeline allows to combine two images using standard diffusion models. -This approach is using (optional) CoCa model to avoid writing image description. +CLIP guided stable diffusion images mixing pipeline allows to combine two images using standard diffusion models. +This approach is using (optional) CoCa model to avoid writing image description. [More code examples](https://github.com/TheDenk/images_mixing) ### Stable Diffusion XL Long Weighted Prompt Pipeline -This SDXL pipeline support unlimited length prompt and negative prompt, compatible with A1111 prompt weighted style. +This SDXL pipeline support unlimited length prompt and negative prompt, compatible with A1111 prompt weighted style. -You can provide both `prompt` and `prompt_2`. if only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline. +You can provide both `prompt` and `prompt_2`. if only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline. ```python from diffusers import DiffusionPipeline @@ -1639,8 +1639,8 @@ neg_prompt = "blur, low quality, carton, animate" pipe.to("cuda") images = pipe( - prompt = prompt - , negative_prompt = neg_prompt + prompt = prompt + , negative_prompt = neg_prompt ).images[0] pipe.to("cpu") @@ -1648,7 +1648,7 @@ torch.cuda.empty_cache() images ``` -In the above code, the `prompt2` is appended to the `prompt`, which is more than 77 tokens. "birds" are showing up in the result. +In the above code, the `prompt2` is appended to the `prompt`, which is more than 77 tokens. "birds" are showing up in the result. ![Stable Diffusion XL Long Weighted Prompt Pipeline sample](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_long_weighted_prompt.png) ## Example Images Mixing (with CoCa) @@ -1700,7 +1700,7 @@ mixing_pipeline.enable_attention_slicing() mixing_pipeline = mixing_pipeline.to("cuda") # Pipeline running -generator = torch.Generator(device="cuda").manual_seed(17) +generator = torch.Generator(device="cuda").manual_seed(17) def download_image(url): response = requests.get(url) @@ -1729,7 +1729,7 @@ pipe_images = mixing_pipeline( ### Stable Diffusion Mixture Tiling This pipeline uses the Mixture. Refer to the [Mixture](https://arxiv.org/abs/2302.02412) paper for more details. - + ```python from diffusers import LMSDiscreteScheduler, DiffusionPipeline @@ -1802,7 +1802,7 @@ image.save('tensorrt_inpaint_mecha_robot.png') ### Stable Diffusion Mixture Canvas This pipeline uses the Mixture. Refer to the [Mixture](https://arxiv.org/abs/2302.02412) paper for more details. - + ```python from PIL import Image from diffusers import LMSDiscreteScheduler, DiffusionPipeline @@ -2011,7 +2011,7 @@ Reference Image ![reference_image](https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png) -Output Image +Output Image `prompt: 1 girl` @@ -2022,7 +2022,7 @@ Reference Image ![reference_image](https://github.com/huggingface/diffusers/assets/34944964/449bdab6-e744-4fb2-9620-d4068d9a741b) -Output Image +Output Image `prompt: A dog` @@ -2103,7 +2103,7 @@ Let's have a look at the images (*512X512*) | Without Feedback | With Feedback (1st image) | |---------------------|---------------------| -| ![Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_wo_feedback.jpg) | ![Feedback Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_w_feedback.png) | +| ![Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_wo_feedback.jpg) | ![Feedback Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_w_feedback.png) | ### Masked Im2Im Stable Diffusion Pipeline @@ -2256,7 +2256,7 @@ pipe.to(torch_device="cuda", torch_dtype=torch.float32) prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" # Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps. -num_inference_steps = 4 +num_inference_steps = 4 images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images ``` @@ -2292,7 +2292,7 @@ input_image=Image.open("myimg.png") strength = 0.5 #strength =0 (no change) strength=1 (completely overwrite image) # Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps. -num_inference_steps = 4 +num_inference_steps = 4 images = pipe(prompt=prompt, image=input_image, strength=strength, num_inference_steps=num_inference_steps, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images ``` @@ -2345,7 +2345,7 @@ assert len(images) == (len(prompts) - 1) * num_interpolation_steps ``` ### StableDiffusionUpscaleLDM3D Pipeline -[LDM3D-VR](https://arxiv.org/pdf/2311.03226.pdf) is an extended version of LDM3D. +[LDM3D-VR](https://arxiv.org/pdf/2311.03226.pdf) is an extended version of LDM3D. The abstract from the paper is: *Latent diffusion models have proven to be state-of-the-art in the creation and manipulation of visual outputs. However, as far as we know, the generation of depth maps jointly with RGB is still limited. We introduce LDM3D-VR, a suite of diffusion models targeting virtual reality development that includes LDM3D-pano and LDM3D-SR. These models enable the generation of panoramic RGBD based on textual prompts and the upscaling of low-resolution inputs to high-resolution RGBD, respectively. Our models are fine-tuned from existing pretrained models on datasets containing panoramic/high-resolution RGB images, depth maps and captions. Both models are evaluated in comparison to existing related methods* @@ -2386,8 +2386,8 @@ upscaled_depth.save(f"upscaled_lemons_depth.png") ''' ### ControlNet + T2I Adapter Pipeline -This pipelines combines both ControlNet and T2IAdapter into a single pipeline, where the forward pass is executed once. -It receives `control_image` and `adapter_image`, as well as `controlnet_conditioning_scale` and `adapter_conditioning_scale`, for the ControlNet and Adapter modules, respectively. Whenever `adapter_conditioning_scale = 0` or `controlnet_conditioning_scale = 0`, it will act as a full ControlNet module or as a full T2IAdapter module, respectively. +This pipelines combines both ControlNet and T2IAdapter into a single pipeline, where the forward pass is executed once. +It receives `control_image` and `adapter_image`, as well as `controlnet_conditioning_scale` and `adapter_conditioning_scale`, for the ControlNet and Adapter modules, respectively. Whenever `adapter_conditioning_scale = 0` or `controlnet_conditioning_scale = 0`, it will act as a full ControlNet module or as a full T2IAdapter module, respectively. ```py import cv2 @@ -2538,7 +2538,7 @@ pipe = RegionalPromptingStableDiffusionPipeline.from_single_file(model_path, vae rp_args = { "mode":"rows", "div": "1;1;1" -} +} prompt =""" green hair twintail BREAK @@ -2567,7 +2567,7 @@ for image in images: ### Cols, Rows mode In the Cols, Rows mode, you can split the screen vertically and horizontally and assign prompts to each region. The split ratio can be specified by 'div', and you can set the division ratio like '3;3;2' or '0.1;0.5'. Furthermore, as will be described later, you can also subdivide the split Cols, Rows to specify more complex regions. -In this image, the image is divided into three parts, and a separate prompt is applied to each. The prompts are divided by 'BREAK', and each is applied to the respective region. +In this image, the image is divided into three parts, and a separate prompt is applied to each. The prompts are divided by 'BREAK', and each is applied to the respective region. ![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline2.png) ``` green hair twintail BREAK @@ -2625,7 +2625,7 @@ prompt =""" a girl in street with shirt, tie, skirt BREAK red, shirt BREAK green, tie BREAK -blue , skirt +blue , skirt """ ``` ![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline3.png) @@ -2644,7 +2644,7 @@ If only one input is given for multiple regions, they are all assumed to be the The difference is that in Prompt, duplicate regions are added, whereas in Prompt-EX, duplicate regions are overwritten sequentially. Since they are processed in order, setting a TARGET with a large regions first makes it easier for the effect of small regions to remain unmuffled. ### Accuracy -In the case of a 512 x 512 image, Attention mode reduces the size of the region to about 8 x 8 pixels deep in the U-Net, so that small regions get mixed up; Latent mode calculates 64*64, so that the region is exact. +In the case of a 512 x 512 image, Attention mode reduces the size of the region to about 8 x 8 pixels deep in the U-Net, so that small regions get mixed up; Latent mode calculates 64*64, so that the region is exact. ``` girl hair twintail frills,ribbons, dress, face BREAK girl, ,face @@ -2675,13 +2675,13 @@ Negative prompts are equally effective across all regions, but it is possible to To activate Regional Prompter, it is necessary to enter settings in rp_args. The items that can be set are as follows. rp_args is a dictionary type. ### Input Parameters -Parameters are specified through the `rp_arg`(dictionary type). +Parameters are specified through the `rp_arg`(dictionary type). ``` rp_args = { "mode":"rows", "div": "1;1;1" -} +} pipe(prompt =prompt, rp_args = rp_args) ``` @@ -2760,7 +2760,7 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur def get_kernel(self): return self.k - + self.kernel_size = kernel_size self.conv = Blurkernel(blur_type='gaussian', kernel_size=kernel_size, @@ -2835,7 +2835,7 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur * ![sample](https://github.com/tongdaxu/Images/assets/22267548/0ceb5575-d42e-4f0b-99c0-50e69c982209) * The reconstruction is perceptually similar to the source image, but different in details. * In dps_pipeline.py, we also provide a super-resolution example, which should produce: - * Downsampled image: + * Downsampled image: * ![dps_mea](https://github.com/tongdaxu/Images/assets/22267548/ff6a33d6-26f0-42aa-88ce-f8a76ba45a13) * Reconstructed image: * ![dps_generated_image](https://github.com/tongdaxu/Images/assets/22267548/b74f084d-93f4-4845-83d8-44c0fa758a5f) @@ -2930,7 +2930,7 @@ The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion). - `show_image` (`bool`, defaults to False): Determine whether to show intermediate results during generation. -``` +```py from diffusers import DiffusionPipeline pipe = DiffusionPipeline.from_pretrained( @@ -2945,24 +2945,24 @@ prompt = "Envision a portrait of an elderly woman, her face a canvas of time, fr negative_prompt = "blurry, ugly, duplicate, poorly drawn, deformed, mosaic" images = pipe( - prompt, + prompt, negative_prompt=negative_prompt, - height=3072, - width=3072, - view_batch_size=16, + height=3072, + width=3072, + view_batch_size=16, stride=64, - num_inference_steps=50, + num_inference_steps=50, guidance_scale=7.5, - cosine_scale_1=3, - cosine_scale_2=1, - cosine_scale_3=1, + cosine_scale_1=3, + cosine_scale_2=1, + cosine_scale_3=1, sigma=0.8, - multi_decoder=True, + multi_decoder=True, show_image=True ) ``` You can display and save the generated images as: -``` +```py def image_grid(imgs, save_path=None): w = 0 @@ -2980,7 +2980,7 @@ def image_grid(imgs, save_path=None): if save_path != None: img.save(save_path + "/img_{}.jpg".format((i + 1) * 1024)) w += w_ - + return grid image_grid(images, save_path="./outputs/") diff --git a/examples/research_projects/README.md b/examples/research_projects/README.md index ef50d423e68f..f1716eea8d61 100644 --- a/examples/research_projects/README.md +++ b/examples/research_projects/README.md @@ -1,7 +1,7 @@ # Research projects -This folder contains various research projects using 🧨 Diffusers. -They are not really maintained by the core maintainers of this library and often require a specific version of Diffusers that is indicated in the requirements file of each folder. +This folder contains various research projects using 🧨 Diffusers. +They are not really maintained by the core maintainers of this library and often require a specific version of Diffusers that is indicated in the requirements file of each folder. Updating them to the most recent version of the library will require some work. To use any of them, just run the command diff --git a/examples/research_projects/mulit_token_textual_inversion/README.md b/examples/research_projects/multi_token_textual_inversion/README.md similarity index 97% rename from examples/research_projects/mulit_token_textual_inversion/README.md rename to examples/research_projects/multi_token_textual_inversion/README.md index 1303f73c1756..97a71b41c7ea 100644 --- a/examples/research_projects/mulit_token_textual_inversion/README.md +++ b/examples/research_projects/multi_token_textual_inversion/README.md @@ -1,6 +1,6 @@ ## [Deprecated] Multi Token Textual Inversion -**IMPORTART: This research project is deprecated. Multi Token Textual Inversion is now supported natively in [the officail textual inversion example](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion#running-locally-with-pytorch).** +**IMPORTART: This research project is deprecated. Multi Token Textual Inversion is now supported natively in [the official textual inversion example](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion#running-locally-with-pytorch).** The author of this project is [Isamu Isozaki](https://github.com/isamu-isozaki) - please make sure to tag the author for issue and PRs as well as @patrickvonplaten. @@ -17,9 +17,9 @@ Feel free to add these options to your training! In practice num_vec_per_token a [Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images using just 3-5 examples. The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion. -## Running on Colab +## Running on Colab -Colab for training +Colab for training [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb) Colab for inference @@ -53,7 +53,7 @@ accelerate config ### Cat toy example -You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license and tick the checkbox if you agree. +You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens). @@ -63,7 +63,7 @@ Run the following command to authenticate your token huggingface-cli login ``` -If you have already cloned the repo, then you won't need to go through these steps. +If you have already cloned the repo, then you won't need to go through these steps.
diff --git a/examples/research_projects/mulit_token_textual_inversion/multi_token_clip.py b/examples/research_projects/multi_token_textual_inversion/multi_token_clip.py similarity index 100% rename from examples/research_projects/mulit_token_textual_inversion/multi_token_clip.py rename to examples/research_projects/multi_token_textual_inversion/multi_token_clip.py diff --git a/examples/research_projects/mulit_token_textual_inversion/requirements.txt b/examples/research_projects/multi_token_textual_inversion/requirements.txt similarity index 100% rename from examples/research_projects/mulit_token_textual_inversion/requirements.txt rename to examples/research_projects/multi_token_textual_inversion/requirements.txt diff --git a/examples/research_projects/mulit_token_textual_inversion/requirements_flax.txt b/examples/research_projects/multi_token_textual_inversion/requirements_flax.txt similarity index 100% rename from examples/research_projects/mulit_token_textual_inversion/requirements_flax.txt rename to examples/research_projects/multi_token_textual_inversion/requirements_flax.txt diff --git a/examples/research_projects/mulit_token_textual_inversion/textual_inversion.py b/examples/research_projects/multi_token_textual_inversion/textual_inversion.py similarity index 100% rename from examples/research_projects/mulit_token_textual_inversion/textual_inversion.py rename to examples/research_projects/multi_token_textual_inversion/textual_inversion.py diff --git a/examples/research_projects/mulit_token_textual_inversion/textual_inversion_flax.py b/examples/research_projects/multi_token_textual_inversion/textual_inversion_flax.py similarity index 100% rename from examples/research_projects/mulit_token_textual_inversion/textual_inversion_flax.py rename to examples/research_projects/multi_token_textual_inversion/textual_inversion_flax.py diff --git a/examples/research_projects/onnxruntime/README.md b/examples/research_projects/onnxruntime/README.md index 204d9c951c99..a7935189f61b 100644 --- a/examples/research_projects/onnxruntime/README.md +++ b/examples/research_projects/onnxruntime/README.md @@ -2,4 +2,4 @@ **This research project is not actively maintained by the diffusers team. For any questions or comments, please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on github with any questions.** -This aims to provide diffusers examples with ONNXRuntime optimizations for training/fine-tuning unconditional image generation, text to image, and textual inversion. Please see individual directories for more details on how to run each task using ONNXRuntime. \ No newline at end of file +This aims to provide diffusers examples with ONNXRuntime optimizations for training/fine-tuning unconditional image generation, text to image, and textual inversion. Please see individual directories for more details on how to run each task using ONNXRuntime. diff --git a/examples/research_projects/onnxruntime/text_to_image/README.md b/examples/research_projects/onnxruntime/text_to_image/README.md index cd9397939ac2..48bce2065444 100644 --- a/examples/research_projects/onnxruntime/text_to_image/README.md +++ b/examples/research_projects/onnxruntime/text_to_image/README.md @@ -34,7 +34,7 @@ accelerate config ### Pokemon example -You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. +You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens). @@ -68,7 +68,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \ --learning_rate=1e-05 \ --max_grad_norm=1 \ --lr_scheduler="constant" --lr_warmup_steps=0 \ - --output_dir="sd-pokemon-model" + --output_dir="sd-pokemon-model" ``` Please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on github with any questions. \ No newline at end of file diff --git a/examples/research_projects/onnxruntime/textual_inversion/README.md b/examples/research_projects/onnxruntime/textual_inversion/README.md index 9f08983eaaad..261c4b49eeaf 100644 --- a/examples/research_projects/onnxruntime/textual_inversion/README.md +++ b/examples/research_projects/onnxruntime/textual_inversion/README.md @@ -3,9 +3,9 @@ [Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images using just 3-5 examples. The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion. -## Running on Colab +## Running on Colab -Colab for training +Colab for training [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb) Colab for inference @@ -39,7 +39,7 @@ accelerate config ### Cat toy example -You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license and tick the checkbox if you agree. +You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens). @@ -49,7 +49,7 @@ Run the following command to authenticate your token huggingface-cli login ``` -If you have already cloned the repo, then you won't need to go through these steps. +If you have already cloned the repo, then you won't need to go through these steps.
diff --git a/examples/research_projects/realfill/README.md b/examples/research_projects/realfill/README.md index b70f425368e5..91821031d2e0 100644 --- a/examples/research_projects/realfill/README.md +++ b/examples/research_projects/realfill/README.md @@ -35,7 +35,7 @@ from accelerate.utils import write_basic_config write_basic_config() ``` -When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups. +When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups. ### Toy example diff --git a/examples/research_projects/sdxl_flax/README.md b/examples/research_projects/sdxl_flax/README.md index 612fdf1edd43..252b7b596ec0 100644 --- a/examples/research_projects/sdxl_flax/README.md +++ b/examples/research_projects/sdxl_flax/README.md @@ -72,8 +72,8 @@ params = jax.tree_util.tree_map(lambda x: x.astype(jnp.bfloat16), params) params["scheduler"] = scheduler_state ``` This section adjusts the data types of the model parameters. -We convert all parameters to `bfloat16` to speed-up the computation with model weights. -**Note** that the scheduler parameters are **not** converted to `blfoat16` as the loss +We convert all parameters to `bfloat16` to speed-up the computation with model weights. +**Note** that the scheduler parameters are **not** converted to `blfoat16` as the loss in precision is degrading the pipeline's performance too significantly. **3. Define Inputs to Pipeline** @@ -146,12 +146,12 @@ For this we will be using a JAX feature called [Ahead of Time](https://jax.readt In [sdxl_single_aot.py](./sdxl_single_aot.py) we give a simple example of how to write our own parallelization logic for text-to-image generation pipeline in JAX using [StabilityAI's Stable Diffusion XL](stabilityai/stable-diffusion-xl-base-1.0) -We add a `aot_compile` function that compiles the `pipeline._generate` function +We add a `aot_compile` function that compiles the `pipeline._generate` function telling JAX which input arguments are static, that is, arguments that -are known at compile time and won't change. In our case, it is num_inference_steps, +are known at compile time and won't change. In our case, it is num_inference_steps, height, width and return_latents. -Once the function is compiled, these parameters are omitted from future calls and +Once the function is compiled, these parameters are omitted from future calls and cannot be changed without modifying the code and recompiling. ```python @@ -205,9 +205,9 @@ def generate( g = jnp.array([guidance_scale] * prompt_ids.shape[0], dtype=jnp.float32) g = g[:, None] images = p_generate( - prompt_ids, - p_params, - rng, + prompt_ids, + p_params, + rng, g, None, neg_prompt_ids) @@ -220,7 +220,7 @@ def generate( The first forward pass after AOT compilation still takes a while longer than subsequent passes, this is because on the first pass, JAX uses Python dispatch, which Fills the C++ dispatch cache. -When using jit, this extra step is done automatically, but when using AOT compilation, +When using jit, this extra step is done automatically, but when using AOT compilation, it doesn't happen until the function call is made. ```python diff --git a/examples/t2i_adapter/README_sdxl.md b/examples/t2i_adapter/README_sdxl.md index d583341c367f..1e5a19fedad1 100644 --- a/examples/t2i_adapter/README_sdxl.md +++ b/examples/t2i_adapter/README_sdxl.md @@ -42,7 +42,7 @@ from accelerate.utils import write_basic_config write_basic_config() ``` -When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups. +When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups. ## Circle filling dataset @@ -85,7 +85,7 @@ accelerate launch train_t2i_adapter_sdxl.py \ To better track our training experiments, we're using the following flags in the command above: * `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`. -* `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected. +* `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected. Our experiments were conducted on a single 40GB A100 GPU. diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md index e2cbaca2a9d8..94fd63f33067 100644 --- a/examples/text_to_image/README.md +++ b/examples/text_to_image/README.md @@ -36,7 +36,7 @@ Note also that we use PEFT library as backend for LoRA training, make sure to ha ### Pokemon example -You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. +You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens). @@ -71,7 +71,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \ --learning_rate=1e-05 \ --max_grad_norm=1 \ --lr_scheduler="constant" --lr_warmup_steps=0 \ - --output_dir="sd-pokemon-model" + --output_dir="sd-pokemon-model" ``` @@ -145,11 +145,11 @@ accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \ --train_batch_size=1 \ --gradient_accumulation_steps=4 \ --gradient_checkpointing \ - --max_train_steps=15000 \ + --max_train_steps=15000 \ --learning_rate=1e-05 \ --max_grad_norm=1 \ --lr_scheduler="constant" --lr_warmup_steps=0 \ - --output_dir="sd-pokemon-model" + --output_dir="sd-pokemon-model" ``` @@ -157,7 +157,7 @@ accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \ We support training with the Min-SNR weighting strategy proposed in [Efficient Diffusion Training via Min-SNR Weighting Strategy](https://arxiv.org/abs/2303.09556) which helps to achieve faster convergence by rebalancing the loss. In order to use it, one needs to set the `--snr_gamma` argument. The recommended -value when using it is 5.0. +value when using it is 5.0. You can find [this project on Weights and Biases](https://wandb.ai/sayakpaul/text2image-finetune-minsnr) that compares the loss surfaces of the following setups: @@ -167,7 +167,7 @@ You can find [this project on Weights and Biases](https://wandb.ai/sayakpaul/tex For our small Pokemons dataset, the effects of Min-SNR weighting strategy might not appear to be pronounced, but for larger datasets, we believe the effects will be more pronounced. -Also, note that in this example, we either predict `epsilon` (i.e., the noise) or the `v_prediction`. For both of these cases, the formulation of the Min-SNR weighting strategy that we have used holds. +Also, note that in this example, we either predict `epsilon` (i.e., the noise) or the `v_prediction`. For both of these cases, the formulation of the Min-SNR weighting strategy that we have used holds. ## Training with LoRA @@ -186,7 +186,7 @@ on consumer GPUs like Tesla T4, Tesla V100. ### Training -First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). +First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). **___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** @@ -197,7 +197,7 @@ export MODEL_NAME="CompVis/stable-diffusion-v1-4" export DATASET_NAME="lambdalabs/pokemon-blip-captions" ``` -For this example we want to directly store the trained LoRA embeddings on the Hub, so +For this example we want to directly store the trained LoRA embeddings on the Hub, so we need to be logged in and add the `--push_to_hub` flag. ```bash @@ -225,11 +225,11 @@ The above command will also run inference as fine-tuning progresses and log the The final LoRA embedding weights have been uploaded to [sayakpaul/sd-model-finetuned-lora-t4](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4). **___Note: [The final weights](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/pytorch_lora_weights.bin) are only 3 MB in size, which is orders of magnitudes smaller than the original model.___** -You can check some inference samples that were logged during the course of the fine-tuning process [here](https://wandb.ai/sayakpaul/text2image-fine-tune/runs/q4lc0xsw). +You can check some inference samples that were logged during the course of the fine-tuning process [here](https://wandb.ai/sayakpaul/text2image-fine-tune/runs/q4lc0xsw). ### Inference -Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline` after loading the trained LoRA weights. You +Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline` after loading the trained LoRA weights. You need to pass the `output_dir` for loading the LoRA weights which, in this case, is `sd-pokemon-model-lora`. ```python @@ -248,9 +248,9 @@ image.save("pokemon.png") If you are loading the LoRA parameters from the Hub and if the Hub repository has a `base_model` tag (such as [this](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/README.md?code=true#L4)), then -you can do: +you can do: -```py +```py from huggingface_hub.repocard import RepoCard lora_model_id = "sayakpaul/sd-model-finetuned-lora-t4" @@ -287,7 +287,7 @@ python train_text_to_image_flax.py \ --max_train_steps=15000 \ --learning_rate=1e-05 \ --max_grad_norm=1 \ - --output_dir="sd-pokemon-model" + --output_dir="sd-pokemon-model" ``` To run on your own training files prepare the dataset according to the format required by `datasets`, you can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata). @@ -321,5 +321,5 @@ According to [this issue](https://github.com/huggingface/diffusers/issues/2234#i ## Stable Diffusion XL -* We support fine-tuning the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) via the `train_text_to_image_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md). -* We also support fine-tuning of the UNet and Text Encoder shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with LoRA via the `train_text_to_image_lora_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md). +* We support fine-tuning the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) via the `train_text_to_image_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md). +* We also support fine-tuning of the UNet and Text Encoder shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with LoRA via the `train_text_to_image_lora_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md). diff --git a/examples/textual_inversion/README.md b/examples/textual_inversion/README.md index 0a1d8a459fc6..0a2723f0982f 100644 --- a/examples/textual_inversion/README.md +++ b/examples/textual_inversion/README.md @@ -3,9 +3,9 @@ [Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images using just 3-5 examples. The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion. -## Running on Colab +## Running on Colab -Colab for training +Colab for training [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb) Colab for inference @@ -84,11 +84,11 @@ accelerate launch textual_inversion.py \ A full training run takes ~1 hour on one V100 GPU. -**Note**: As described in [the official paper](https://arxiv.org/abs/2208.01618) +**Note**: As described in [the official paper](https://arxiv.org/abs/2208.01618) only one embedding vector is used for the placeholder token, *e.g.* `""`. -However, one can also add multiple embedding vectors for the placeholder token -to increase the number of fine-tuneable parameters. This can help the model to learn -more complex details. To use multiple embedding vectors, you should define `--num_vectors` +However, one can also add multiple embedding vectors for the placeholder token +to increase the number of fine-tuneable parameters. This can help the model to learn +more complex details. To use multiple embedding vectors, you should define `--num_vectors` to a number larger than one, *e.g.*: ```bash --num_vectors 5 diff --git a/examples/unconditional_image_generation/README.md b/examples/unconditional_image_generation/README.md index d83dc928c7a1..2990b3abf3f5 100644 --- a/examples/unconditional_image_generation/README.md +++ b/examples/unconditional_image_generation/README.md @@ -27,7 +27,7 @@ And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) e accelerate config ``` -### Unconditional Flowers +### Unconditional Flowers The command to train a DDPM UNet model on the Oxford Flowers dataset: @@ -52,7 +52,7 @@ A full training run takes 2 hours on 4xV100 GPUs. -### Unconditional Pokemon +### Unconditional Pokemon The command to train a DDPM UNet model on the Pokemon dataset: @@ -96,7 +96,7 @@ accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \ --logger="wandb" ``` -To be able to use Weights and Biases (`wandb`) as a logger you need to install the library: `pip install wandb`. +To be able to use Weights and Biases (`wandb`) as a logger you need to install the library: `pip install wandb`. ### Using your own data diff --git a/examples/wuerstchen/text_to_image/README.md b/examples/wuerstchen/text_to_image/README.md index 5378e3ef5253..8b2040b4ca7f 100644 --- a/examples/wuerstchen/text_to_image/README.md +++ b/examples/wuerstchen/text_to_image/README.md @@ -72,7 +72,7 @@ In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-de ### Prior Training -First, you need to set up your development environment as explained in the [installation](#Running-locally-with-PyTorch) section. Make sure to set the `DATASET_NAME` environment variable. Here, we will use the [Pokemon captions dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). +First, you need to set up your development environment as explained in the [installation](#Running-locally-with-PyTorch) section. Make sure to set the `DATASET_NAME` environment variable. Here, we will use the [Pokemon captions dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). ```bash export DATASET_NAME="lambdalabs/pokemon-blip-captions" diff --git a/src/diffusers/pipelines/README.md b/src/diffusers/pipelines/README.md index 7562040596e9..d5125ae5caf2 100644 --- a/src/diffusers/pipelines/README.md +++ b/src/diffusers/pipelines/README.md @@ -1,33 +1,33 @@ # 🧨 Diffusers Pipelines Pipelines provide a simple way to run state-of-the-art diffusion models in inference. -Most diffusion systems consist of multiple independently-trained models and highly adaptable scheduler +Most diffusion systems consist of multiple independently-trained models and highly adaptable scheduler components - all of which are needed to have a functioning end-to-end diffusion system. As an example, [Stable Diffusion](https://huggingface.co/blog/stable_diffusion) has three independently trained models: - [Autoencoder](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/models/vae.py#L392) - [Conditional Unet](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/models/unet_2d_condition.py#L12) - [CLIP text encoder](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPTextModel) -- a scheduler component, [scheduler](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_pndm.py), +- a scheduler component, [scheduler](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_pndm.py), - a [CLIPImageProcessor](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPImageProcessor), - as well as a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py). -All of these components are necessary to run stable diffusion in inference even though they were trained +All of these components are necessary to run stable diffusion in inference even though they were trained or created independently from each other. -To that end, we strive to offer all open-sourced, state-of-the-art diffusion system under a unified API. +To that end, we strive to offer all open-sourced, state-of-the-art diffusion system under a unified API. More specifically, we strive to provide pipelines that - 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LDMTextToImagePipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)), -- 2. have a simple user interface to run the model in inference (see the [Pipelines API](#pipelines-api) section), +- 2. have a simple user interface to run the model in inference (see the [Pipelines API](#pipelines-api) section), - 3. are easy to understand with code that is self-explanatory and can be read along-side the official paper (see [Pipelines summary](#pipelines-summary)), - 4. can easily be contributed by the community (see the [Contribution](#contribution) section). -**Note** that pipelines do not (and should not) offer any training functionality. +**Note** that pipelines do not (and should not) offer any training functionality. If you are looking for *official* training examples, please have a look at [examples](https://github.com/huggingface/diffusers/tree/main/examples). ## Pipelines Summary -The following table summarizes all officially supported pipelines, their corresponding paper, and if +The following table summarizes all officially supported pipelines, their corresponding paper, and if available a colab notebook to directly try them out. | Pipeline | Source | Tasks | Colab @@ -35,35 +35,35 @@ available a colab notebook to directly try them out. | [dance diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/Harmonai-org/sample-generator) | *Unconditional Audio Generation* | | [ddpm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | *Unconditional Image Generation* | | [ddim](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | *Unconditional Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) -| [latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Text-to-Image Generation* | -| [latent_diffusion_uncond](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Unconditional Image Generation* | -| [pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | *Unconditional Image Generation* | -| [score_sde_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | -| [score_sde_vp](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | +| [latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Text-to-Image Generation* | +| [latent_diffusion_uncond](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Unconditional Image Generation* | +| [pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | *Unconditional Image Generation* | +| [score_sde_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | +| [score_sde_vp](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | | [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) | [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) | [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) -| [stochastic_karras_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | *Unconditional Image Generation* | +| [stochastic_karras_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | *Unconditional Image Generation* | -**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. +**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. However, most of them can be adapted to use different scheduler components or even different model components. Some pipeline examples are shown in the [Examples](#examples) below. ## Pipelines API -Diffusion models often consist of multiple independently-trained models or other previously existing components. +Diffusion models often consist of multiple independently-trained models or other previously existing components. -Each model has been trained independently on a different task and the scheduler can easily be swapped out and replaced with a different one. +Each model has been trained independently on a different task and the scheduler can easily be swapped out and replaced with a different one. During inference, we however want to be able to easily load all components and use them in inference - even if one component, *e.g.* CLIP's text encoder, originates from a different library, such as [Transformers](https://github.com/huggingface/transformers). To that end, all pipelines provide the following functionality: - [`from_pretrained` method](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L139) that accepts a Hugging Face Hub repository id, *e.g.* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) or a path to a local directory, *e.g.* -"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [runwayml/stable-diffusion-v1-5/model_index.json](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), which defines all components that should be +"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [runwayml/stable-diffusion-v1-5/model_index.json](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), which defines all components that should be loaded into the pipelines. More specifically, for each model/component one needs to define the format `: ["", ""]`. `` is the attribute name given to the loaded instance of `` which can be found in the library or pipeline folder called `""`. -- [`save_pretrained`](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L90) that accepts a local path, *e.g.* `./stable-diffusion` under which all models/components of the pipeline will be saved. For each component/model a folder is created inside the local path that is named after the given attribute name, *e.g.* `./stable_diffusion/unet`. -In addition, a `model_index.json` file is created at the root of the local path, *e.g.* `./stable_diffusion/model_index.json` so that the complete pipeline can again be instantiated +- [`save_pretrained`](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L90) that accepts a local path, *e.g.* `./stable-diffusion` under which all models/components of the pipeline will be saved. For each component/model a folder is created inside the local path that is named after the given attribute name, *e.g.* `./stable_diffusion/unet`. +In addition, a `model_index.json` file is created at the root of the local path, *e.g.* `./stable_diffusion/model_index.json` so that the complete pipeline can again be instantiated from the local path. - [`to`](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L118) which accepts a `string` or `torch.device` to move all models that are of type `torch.nn.Module` to the passed device. The behavior is fully analogous to [PyTorch's `to` method](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to). -- [`__call__`] method to use the pipeline in inference. `__call__` defines inference logic of the pipeline and should ideally encompass all aspects of it, from pre-processing to forwarding tensors to the different models and schedulers, as well as post-processing. The API of the `__call__` method can strongly vary from pipeline to pipeline. *E.g.* a text-to-image pipeline, such as [`StableDiffusionPipeline`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) should accept among other things the text prompt to generate the image. A pure image generation pipeline, such as [DDPMPipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/ddpm) on the other hand can be run without providing any inputs. To better understand what inputs can be adapted for +- [`__call__`] method to use the pipeline in inference. `__call__` defines inference logic of the pipeline and should ideally encompass all aspects of it, from pre-processing to forwarding tensors to the different models and schedulers, as well as post-processing. The API of the `__call__` method can strongly vary from pipeline to pipeline. *E.g.* a text-to-image pipeline, such as [`StableDiffusionPipeline`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) should accept among other things the text prompt to generate the image. A pure image generation pipeline, such as [DDPMPipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/ddpm) on the other hand can be run without providing any inputs. To better understand what inputs can be adapted for each pipeline, one should look directly into the respective pipeline. **Note**: All pipelines have PyTorch's autograd disabled by decorating the `__call__` method with a [`torch.no_grad`](https://pytorch.org/docs/stable/generated/torch.no_grad.html) decorator because pipelines should @@ -71,12 +71,12 @@ not be used for training. If you want to store the gradients during the forward ## Contribution -We are more than happy about any contribution to the officially supported pipelines 🤗. We aspire +We are more than happy about any contribution to the officially supported pipelines 🤗. We aspire all of our pipelines to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**. -- **Self-contained**: A pipeline shall be as self-contained as possible. More specifically, this means that all functionality should be either directly defined in the pipeline file itself, should be inherited from (and only from) the [`DiffusionPipeline` class](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L56) or be directly attached to the model and scheduler components of the pipeline. -- **Easy-to-use**: Pipelines should be extremely easy to use - one should be able to load the pipeline and -use it for its designated task, *e.g.* text-to-image generation, in just a couple of lines of code. Most +- **Self-contained**: A pipeline shall be as self-contained as possible. More specifically, this means that all functionality should be either directly defined in the pipeline file itself, should be inherited from (and only from) the [`DiffusionPipeline` class](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L56) or be directly attached to the model and scheduler components of the pipeline. +- **Easy-to-use**: Pipelines should be extremely easy to use - one should be able to load the pipeline and +use it for its designated task, *e.g.* text-to-image generation, in just a couple of lines of code. Most logic including pre-processing, an unrolled diffusion loop, and post-processing should all happen inside the `__call__` method. - **Easy-to-tweak**: Certain pipelines will not be able to handle all use cases and tasks that you might like them to. If you want to use a certain pipeline for a specific use case that is not yet supported, you might have to copy the pipeline file and tweak the code to your needs. We try to make the pipeline code as readable as possible so that each part –from pre-processing to diffusing to post-processing– can easily be adapted. If you would like the community to benefit from your customized pipeline, we would love to see a contribution to our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community). If you feel that an important pipeline should be part of the official pipelines but isn't, a contribution to the [official pipelines](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines) would be even better. - **One-purpose-only**: Pipelines should be used for one task and one task only. Even if two tasks are very similar from a modeling point of view, *e.g.* image2image translation and in-painting, pipelines shall be used for one task only to keep them *easy-to-tweak* and *readable*. @@ -93,8 +93,8 @@ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe = pipe.to("cuda") prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt).images[0] - +image = pipe(prompt).images[0] + image.save("astronaut_rides_horse.png") ``` diff --git a/src/diffusers/pipelines/spectrogram_diffusion/continous_encoder.py b/src/diffusers/pipelines/spectrogram_diffusion/continuous_encoder.py similarity index 100% rename from src/diffusers/pipelines/spectrogram_diffusion/continous_encoder.py rename to src/diffusers/pipelines/spectrogram_diffusion/continuous_encoder.py diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 93af3b1189d0..88725af452c2 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -29,7 +29,7 @@ from ..onnx_utils import OnnxRuntimeModel from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline -from .continous_encoder import SpectrogramContEncoder +from .continuous_encoder import SpectrogramContEncoder from .notes_encoder import SpectrogramNotesEncoder diff --git a/src/diffusers/pipelines/stable_diffusion/README.md b/src/diffusers/pipelines/stable_diffusion/README.md index 66df9a811afb..5b6424308f02 100644 --- a/src/diffusers/pipelines/stable_diffusion/README.md +++ b/src/diffusers/pipelines/stable_diffusion/README.md @@ -6,13 +6,13 @@ Stable Diffusion was proposed in [Stable Diffusion Announcement](https://stabili The summary of the model is the following: -*Stable Diffusion is a text-to-image model that will empower billions of people to create stunning art within seconds. It is a breakthrough in speed and quality meaning that it can run on consumer GPUs. You can see some of the amazing output that has been created by this model without pre or post-processing on this page. The model itself builds upon the work of the team at CompVis and Runway in their widely used latent diffusion model combined with insights from the conditional diffusion models by our lead generative AI developer Katherine Crowson, Dall-E 2 by Open AI, Imagen by Google Brain and many others. We are delighted that AI media generation is a cooperative field and hope it can continue this way to bring the gift of creativity to all.* +*Stable Diffusion is a text-to-image model that will empower billions of people to create stunning art within seconds. It is a breakthrough in speed and quality meaning that it can run on consumer GPUs. You can see some of the amazing output that has been created by this model without pre or post-processing on this page. The model itself builds upon the work of the team at CompVis and Runway in their widely used latent diffusion model combined with insights from the conditional diffusion models by our lead generative AI developer Katherine Crowson, Dall-E 2 by Open AI, Imagen by Google Brain and many others. We are delighted that AI media generation is a cooperative field and hope it can continue this way to bring the gift of creativity to all.* ## Tips: - Stable Diffusion has the same architecture as [Latent Diffusion](https://arxiv.org/abs/2112.10752) but uses a frozen CLIP Text Encoder instead of training the text encoder jointly with the diffusion model. - An in-detail explanation of the Stable Diffusion model can be found under [Stable Diffusion with 🧨 Diffusers](https://huggingface.co/blog/stable_diffusion). -- If you don't want to rely on the Hugging Face Hub and having to pass a authentication token, you can +- If you don't want to rely on the Hugging Face Hub and having to pass a authentication token, you can download the weights with `git lfs install; git clone https://huggingface.co/runwayml/stable-diffusion-v1-5` and instead pass the local path to the cloned folder to `from_pretrained` as shown below. - Stable Diffusion can work with a variety of different samplers as is shown below. @@ -28,7 +28,7 @@ download the weights with `git lfs install; git clone https://huggingface.co/run ### Using Stable Diffusion without being logged into the Hub. -If you want to download the model weights using a single Python line, you need to be logged in via `huggingface-cli login`. +If you want to download the model weights using a single Python line, you need to be logged in via `huggingface-cli login`. ```python from diffusers import DiffusionPipeline @@ -61,8 +61,8 @@ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe = pipe.to("cuda") prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt).images[0] - +image = pipe(prompt).images[0] + image.save("astronaut_rides_horse.png") ``` @@ -75,13 +75,13 @@ from diffusers import StableDiffusionPipeline, DDIMScheduler scheduler = DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler") pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", + "runwayml/stable-diffusion-v1-5", scheduler=scheduler, ).to("cuda") prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt).images[0] - +image = pipe(prompt).images[0] + image.save("astronaut_rides_horse.png") ``` @@ -94,13 +94,13 @@ from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler lms = LMSDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler") pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", + "runwayml/stable-diffusion-v1-5", scheduler=lms, ).to("cuda") prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt).images[0] - +image = pipe(prompt).images[0] + image.save("astronaut_rides_horse.png") ```