From adcbe674a49af31415a1131174143d58c0e68d7d Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Thu, 1 Feb 2024 09:51:02 -1000
Subject: [PATCH 01/43] [refactor]Scheduler.set_begin_index (#6728)

---
 .../controlnet/pipeline_controlnet_img2img.py | 2 +
 .../controlnet/pipeline_controlnet_inpaint.py | 2 +
 .../pipeline_controlnet_sd_xl_img2img.py | 2 +
 .../pipeline_cycle_diffusion.py | 2 +
 ...ipeline_stable_diffusion_inpaint_legacy.py | 2 +
 .../pipeline_latent_consistency_img2img.py | 2 +
 src/diffusers/pipelines/pia/pipeline_pia.py | 2 +
 .../pipeline_stable_diffusion_depth2img.py | 2 +
 .../pipeline_stable_diffusion_img2img.py | 2 +
 .../pipeline_stable_diffusion_inpaint.py | 2 +
 .../pipeline_stable_diffusion_diffedit.py | 2 +
 .../pipeline_text_to_video_synth_img2img.py | 2 +
 .../scheduling_consistency_models.py | 59 +++++++++----
 .../schedulers/scheduling_deis_multistep.py | 59 +++++++++----
 .../scheduling_dpmsolver_multistep.py | 56 +++++++++----
 .../scheduling_dpmsolver_multistep_inverse.py | 2 -
 .../schedulers/scheduling_dpmsolver_sde.py | 64 ++++++++-------
 .../scheduling_dpmsolver_singlestep.py | 59 +++++++++----
 .../scheduling_euler_ancestral_discrete.py | 52 +++++++++---
 .../schedulers/scheduling_euler_discrete.py | 49 ++++++++---
 .../schedulers/scheduling_heun_discrete.py | 64 ++++++++-------
 src/diffusers/schedulers/scheduling_ipndm.py | 46 ++++++++---
 .../scheduling_k_dpm_2_ancestral_discrete.py | 82 ++++++++++---------
 .../schedulers/scheduling_k_dpm_2_discrete.py | 82 ++++++++++---------
 src/diffusers/schedulers/scheduling_lcm.py | 46 ++++++++---
 .../schedulers/scheduling_lms_discrete.py | 52 +++++++++---
 .../schedulers/scheduling_sasolver.py | 44 ++++++++--
 .../schedulers/scheduling_unipc_multistep.py | 59 +++++++++----
 28 files changed, 620 insertions(+), 279 deletions(-)

diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
index f5e4775900de..846da6c76d59 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
@@ -789,6 +789,8 @@ def get_timesteps(self, num_inference_steps, strength, device):
         t_start = max(num_inference_steps - init_timestep, 0)
 
         timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
 
         return timesteps, num_inference_steps - t_start
 
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
index bc6133c8b2d1..bc985beae69d 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
@@ -705,6 +705,8 @@ def get_timesteps(self, num_inference_steps, strength, device):
         t_start = max(num_inference_steps - init_timestep, 0)
 
         timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
 
         return timesteps, num_inference_steps - t_start
 
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
index dda2f207b90a..ca6b5165fefb 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ 
b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -871,6 +871,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py index 9d2b3ca8abaf..da2f4ba9b6e9 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py @@ -566,6 +566,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py index 4daa1c07f0c6..449b6d88b9de 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py @@ -536,6 +536,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index 4146a35fb909..509b5ab34bde 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -634,6 +634,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index 565544a0fef4..fda56088b916 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -906,6 +906,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps, num_inference_steps - t_start diff --git 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index e431fee7bdb0..440a972ff8e0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -467,6 +467,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index f78cd383b83a..a9b04b493c7e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -659,6 +659,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 5d77341511a3..111a70aa5c09 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -859,6 +859,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py index d0d132555e69..82e91e3565ea 100644 --- a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py @@ -754,6 +754,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index c781e490caae..342a81b81a2e 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -554,6 +554,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = 
max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index e7e0dcbdc31e..9b672a74fc26 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -98,15 +98,9 @@ def __init__( self.custom_timesteps = False self.is_scale_input_called = False self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def index_for_timestep(self, timestep, schedule_timesteps=None): - if schedule_timesteps is None: - schedule_timesteps = self.timesteps - - indices = (schedule_timesteps == timestep).nonzero() - return indices.item() - @property def step_index(self): """ @@ -114,6 +108,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def scale_model_input( self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] ) -> torch.FloatTensor: @@ -231,6 +243,7 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device=device) self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Modified _convert_to_karras implementation that takes in ramp as argument @@ -280,23 +293,29 @@ def get_scalings_for_boundary_condition(self, sigma): c_out = (sigma - sigma_min) * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 return c_skip, c_out - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + indices = (schedule_timesteps == timestep).nonzero() # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) - if len(index_candidates) > 1: - step_index = index_candidates[1] - else: - step_index = index_candidates[0] + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() - self._step_index = step_index.item() + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index def step( self, @@ -412,7 +431,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index e8bd5f8f68d4..a0831d80f71b 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -187,6 +187,7 @@ def __init__( self.model_outputs = [None] * solver_order self.lower_order_nums = 0 self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property @@ -196,6 +197,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). 
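# Illustrative sketch, not part of the patch: how the `set_begin_index` API added in the hunks
# above is meant to be driven by the img2img-style pipelines patched at the top of this diff.
# `get_timesteps` below is a simplified standalone copy of the pipeline helper; the `hasattr`
# guard keeps older schedulers without `set_begin_index` working unchanged. Assumes a diffusers
# version that already includes this patch.
from diffusers import DEISMultistepScheduler


def get_timesteps(scheduler, num_inference_steps, strength):
    # keep only the last `strength` fraction of the schedule, as img2img pipelines do
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    timesteps = scheduler.timesteps[t_start * scheduler.order :]
    if hasattr(scheduler, "set_begin_index"):
        # tell the scheduler where the denoising loop will start, once, before any `step` call
        scheduler.set_begin_index(t_start * scheduler.order)
    return timesteps, num_inference_steps - t_start


scheduler = DEISMultistepScheduler()
scheduler.set_timesteps(num_inference_steps=10)
timesteps, _ = get_timesteps(scheduler, num_inference_steps=10, strength=0.3)
print(len(timesteps), scheduler.begin_index)  # 3 7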
@@ -255,6 +274,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic # add an index counter for schedulers that allow duplicated timesteps self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample @@ -620,11 +640,12 @@ def ind_fn(t, b, c, d): else: raise NotImplementedError("only support log-rho multistep deis now") - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + index_candidates = (schedule_timesteps == timestep).nonzero() if len(index_candidates) == 0: step_index = len(self.timesteps) - 1 @@ -637,7 +658,20 @@ def _init_step_index(self, timestep): else: step_index = index_candidates[0].item() - self._step_index = step_index + return step_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index def step( self, @@ -736,16 +770,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [] - for timestep in timesteps: - index_candidates = (schedule_timesteps == timestep).nonzero() - if len(index_candidates) == 0: - step_index = len(schedule_timesteps) - 1 - elif len(index_candidates) > 1: - step_index = index_candidates[1].item() - else: - step_index = index_candidates[0].item() - step_indices.append(step_index) + # begin_index is None when the scheduler is used for training + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index d70d4eec9b3e..bfb0d943ee2c 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -227,6 +227,7 @@ def __init__( self.model_outputs = [None] * solver_order self.lower_order_nums = 0 self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property @@ -236,6 +237,23 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. 
+ + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). @@ -311,6 +329,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc # add an index counter for schedulers that allow duplicated timesteps self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample @@ -792,11 +811,11 @@ def multistep_dpm_solver_third_order_update( ) return x_t - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + index_candidates = (schedule_timesteps == timestep).nonzero() if len(index_candidates) == 0: step_index = len(self.timesteps) - 1 @@ -809,7 +828,19 @@ def _init_step_index(self, timestep): else: step_index = index_candidates[0].item() - self._step_index = step_index + return step_index + + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index def step( self, @@ -920,16 +951,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [] - for timestep in timesteps: - index_candidates = (schedule_timesteps == timestep).nonzero() - if len(index_candidates) == 0: - step_index = len(schedule_timesteps) - 1 - elif len(index_candidates) > 1: - step_index = index_candidates[1].item() - else: - step_index = index_candidates[0].item() - step_indices.append(step_index) + # begin_index is None when the scheduler is used for training + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 03fc3677d07f..089cfc0d988f 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -767,7 +767,6 @@ def multistep_dpm_solver_third_order_update( ) return x_t - # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index def _init_step_index(self, timestep): if isinstance(timestep, torch.Tensor): timestep = timestep.to(self.timesteps.device) @@ -879,7 +878,6 @@ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch """ return sample - # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise def add_noise( self, original_samples: torch.FloatTensor, diff --git 
a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 20c294f95bd6..c51cd3f440a3 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -13,7 +13,6 @@ # limitations under the License. import math -from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -198,9 +197,10 @@ def __init__( self.noise_sampler = None self.noise_sampler_seed = noise_sampler_seed self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.index_for_timestep + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: schedule_timesteps = self.timesteps @@ -211,31 +211,18 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): # is always the second index (or the last index if there is only 1) # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. for image-to-image) - if len(self._index_counter) == 0: - pos = 1 if len(indices) > 1 else 0 - else: - timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep - pos = self._index_counter[timestep_int] + pos = 1 if len(indices) > 1 else 0 return indices[pos].item() # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) - - index_candidates = (self.timesteps == timestep).nonzero() - - # The sigma index that is taken for the **very** first `step` - # is always the second index (or the last index if there is only 1) - # This way we can ensure we don't accidentally skip a sigma in - # case we start in the middle of the denoising schedule (e.g. for image-to-image) - if len(index_candidates) > 1: - step_index = index_candidates[1] + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) else: - step_index = index_candidates[0] - - self._step_index = step_index.item() + self._step_index = self._begin_index @property def init_noise_sigma(self): @@ -252,6 +239,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. 
+ """ + self._begin_index = begin_index + def scale_model_input( self, sample: torch.FloatTensor, @@ -348,13 +353,10 @@ def set_timesteps( self.mid_point_sigma = None self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.noise_sampler = None - # for exp beta schedules, such as the one for `pipeline_shap_e.py` - # we need an index counter - self._index_counter = defaultdict(int) - def _second_order_timesteps(self, sigmas, log_sigmas): def sigma_fn(_t): return np.exp(-_t) @@ -444,10 +446,6 @@ def step( if self.step_index is None: self._init_step_index(timestep) - # advance index counter by 1 - timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep - self._index_counter[timestep_int] += 1 - # Create a noise sampler if it hasn't been created yet if self.noise_sampler is None: min_sigma, max_sigma = self.sigmas[self.sigmas > 0].min(), self.sigmas.max() @@ -527,7 +525,7 @@ def t_fn(_sigma: torch.FloatTensor) -> torch.FloatTensor: return SchedulerOutput(prev_sample=prev_sample) - # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.add_noise + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, original_samples: torch.FloatTensor, @@ -544,7 +542,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index f664374a4238..e22085da74f5 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -210,6 +210,7 @@ def __init__( self.sample = None self.order_list = self.get_order_list(num_train_timesteps) self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication def get_order_list(self, num_inference_steps: int) -> List[int]: @@ -253,6 +254,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). 
@@ -315,6 +334,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic # add an index counter for schedulers that allow duplicated timesteps self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample @@ -813,11 +833,12 @@ def singlestep_dpm_solver_update( else: raise ValueError(f"Order must be 1, 2, 3, got {order}") - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + index_candidates = (schedule_timesteps == timestep).nonzero() if len(index_candidates) == 0: step_index = len(self.timesteps) - 1 @@ -830,7 +851,20 @@ def _init_step_index(self, timestep): else: step_index = index_candidates[0].item() - self._step_index = step_index + return step_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index def step( self, @@ -925,16 +959,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [] - for timestep in timesteps: - index_candidates = (schedule_timesteps == timestep).nonzero() - if len(index_candidates) == 0: - step_index = len(schedule_timesteps) - 1 - elif len(index_candidates) > 1: - step_index = index_candidates[1].item() - else: - step_index = index_candidates[0].item() - step_indices.append(step_index) + # begin_index is None when the scheduler is used for training + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index acad67847237..35fb22c9fdab 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -216,6 +216,7 @@ def __init__( self.is_scale_input_called = False self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property @@ -233,6 +234,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. 
This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def scale_model_input( self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] ) -> torch.FloatTensor: @@ -300,25 +319,32 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device=device) self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + indices = (schedule_timesteps == timestep).nonzero() # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. for image-to-image) - if len(index_candidates) > 1: - step_index = index_candidates[1] - else: - step_index = index_candidates[0] + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() - self._step_index = step_index.item() + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index def step( self, @@ -440,7 +466,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 6ed28f410aea..c5e858e545be 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -237,6 +237,7 @@ def __init__( self.use_karras_sigmas = use_karras_sigmas self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property @@ -255,6 +256,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. 
+ """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def scale_model_input( self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] ) -> torch.FloatTensor: @@ -342,6 +361,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication def _sigma_to_t(self, sigma, log_sigmas): @@ -393,22 +413,27 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho return sigmas - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + indices = (schedule_timesteps == timestep).nonzero() # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. for image-to-image) - if len(index_candidates) > 1: - step_index = index_candidates[1] - else: - step_index = index_candidates[0] + pos = 1 if len(indices) > 1 else 0 - self._step_index = step_index.item() + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index def step( self, @@ -538,7 +563,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index a1ea18dcf168..b1877bae4727 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -13,7 +13,6 @@ # limitations under the License. 
import math -from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -148,8 +147,10 @@ def __init__( self.use_karras_sigmas = use_karras_sigmas self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: schedule_timesteps = self.timesteps @@ -160,11 +161,7 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): # is always the second index (or the last index if there is only 1) # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. for image-to-image) - if len(self._index_counter) == 0: - pos = 1 if len(indices) > 1 else 0 - else: - timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep - pos = self._index_counter[timestep_int] + pos = 1 if len(indices) > 1 else 0 return indices[pos].item() @@ -183,6 +180,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def scale_model_input( self, sample: torch.FloatTensor, @@ -270,13 +285,9 @@ def set_timesteps( self.dt = None self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # (YiYi Notes: keep this for now since we are keeping add_noise function which use index_for_timestep) - # for exp beta schedules, such as the one for `pipeline_shap_e.py` - # we need an index counter - self._index_counter = defaultdict(int) - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t def _sigma_to_t(self, sigma, log_sigmas): # get log sigma @@ -333,21 +344,12 @@ def state_in_first_order(self): # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) - - index_candidates = (self.timesteps == timestep).nonzero() - - # The sigma index that is taken for the **very** first `step` - # is always the second index (or the last index if there is only 1) - # This way we can ensure we don't accidentally skip a sigma in - # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) - if len(index_candidates) > 1: - step_index = index_candidates[1] + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) else: - step_index = index_candidates[0] - - self._step_index = step_index.item() + self._step_index = self._begin_index def step( self, @@ -378,11 +380,6 @@ def step( if self.step_index is None: self._init_step_index(timestep) - # (YiYi notes: keep this for now since we are keeping the add_noise method) - # advance index counter by 1 - timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep - self._index_counter[timestep_int] += 1 - if self.state_in_first_order: sigma = self.sigmas[self.step_index] sigma_next = self.sigmas[self.step_index + 1] @@ -453,6 +450,7 @@ def step( return SchedulerOutput(prev_sample=prev_sample) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, original_samples: torch.FloatTensor, @@ -469,7 +467,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_ipndm.py b/src/diffusers/schedulers/scheduling_ipndm.py index aeebd029a441..4025bad1a327 100644 --- a/src/diffusers/schedulers/scheduling_ipndm.py +++ b/src/diffusers/schedulers/scheduling_ipndm.py @@ -56,6 +56,7 @@ def __init__( # running values self.ets = [] self._step_index = None + self._begin_index = None @property def step_index(self): @@ -64,6 +65,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). 
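# Illustrative sketch, not part of the patch: the new branch that `add_noise` gains in the
# schedulers above. With `begin_index` unset (training, or a pipeline that never calls
# `set_begin_index`) the step indices are still looked up per timestep; once the pipeline has
# declared the begin index, the whole batch reuses it and no search is needed. This is a
# simplified stand-in -- the real methods route the lookup through `index_for_timestep`.
import torch


def pick_step_indices(timesteps, schedule_timesteps, begin_index=None):
    if begin_index is None:
        # training path: search the schedule for every timestep in the batch
        return [(schedule_timesteps == t).nonzero()[0].item() for t in timesteps]
    # img2img path: the pipeline already told the scheduler where denoising starts
    return [begin_index] * timesteps.shape[0]


schedule = torch.tensor([999, 749, 499, 249])
batch_timesteps = torch.tensor([499, 499])
print(pick_step_indices(batch_timesteps, schedule))                 # [2, 2] via lookup
print(pick_step_indices(batch_timesteps, schedule, begin_index=2))  # [2, 2] with no search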
@@ -90,24 +109,31 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.ets = [] self._step_index = None + self._begin_index = None - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + indices = (schedule_timesteps == timestep).nonzero() # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. for image-to-image) - if len(index_candidates) > 1: - step_index = index_candidates[1] - else: - step_index = index_candidates[0] + pos = 1 if len(indices) > 1 else 0 - self._step_index = step_index.item() + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index def step( self, diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index 4a1cdb561cea..5c1934c1b077 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -13,7 +13,6 @@ # limitations under the License. import math -from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -140,27 +139,9 @@ def __init__( # set all values self.set_timesteps(num_train_timesteps, None, num_train_timesteps) self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.index_for_timestep - def index_for_timestep(self, timestep, schedule_timesteps=None): - if schedule_timesteps is None: - schedule_timesteps = self.timesteps - - indices = (schedule_timesteps == timestep).nonzero() - - # The sigma index that is taken for the **very** first `step` - # is always the second index (or the last index if there is only 1) - # This way we can ensure we don't accidentally skip a sigma in - # case we start in the middle of the denoising schedule (e.g. for image-to-image) - if len(self._index_counter) == 0: - pos = 1 if len(indices) > 1 else 0 - else: - timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep - pos = self._index_counter[timestep_int] - - return indices[pos].item() - @property def init_noise_sigma(self): # standard deviation of the initial noise distribution @@ -176,6 +157,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. 
+ """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def scale_model_input( self, sample: torch.FloatTensor, @@ -295,11 +294,8 @@ def set_timesteps( self.sample = None - # for exp beta schedules, such as the one for `pipeline_shap_e.py` - # we need an index counter - self._index_counter = defaultdict(int) - self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t @@ -356,23 +352,29 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def state_in_first_order(self): return self.sample is None - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + indices = (schedule_timesteps == timestep).nonzero() # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) - if len(index_candidates) > 1: - step_index = index_candidates[1] - else: - step_index = index_candidates[0] + pos = 1 if len(indices) > 1 else 0 - self._step_index = step_index.item() + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index def step( self, @@ -406,10 +408,6 @@ def step( if self.step_index is None: self._init_step_index(timestep) - # advance index counter by 1 - timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep - self._index_counter[timestep_int] += 1 - if self.state_in_first_order: sigma = self.sigmas[self.step_index] sigma_interpol = self.sigmas_interpol[self.step_index] @@ -478,7 +476,7 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.add_noise + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, original_samples: torch.FloatTensor, @@ -495,7 +493,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 57062c0d3586..7c800e4e68b2 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -13,7 +13,6 @@ # limitations under the License. import math -from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -140,27 +139,9 @@ def __init__( self.set_timesteps(num_train_timesteps, None, num_train_timesteps) self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.index_for_timestep - def index_for_timestep(self, timestep, schedule_timesteps=None): - if schedule_timesteps is None: - schedule_timesteps = self.timesteps - - indices = (schedule_timesteps == timestep).nonzero() - - # The sigma index that is taken for the **very** first `step` - # is always the second index (or the last index if there is only 1) - # This way we can ensure we don't accidentally skip a sigma in - # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) - if len(self._index_counter) == 0: - pos = 1 if len(indices) > 1 else 0 - else: - timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep - pos = self._index_counter[timestep_int] - - return indices[pos].item() - @property def init_noise_sigma(self): # standard deviation of the initial noise distribution @@ -176,6 +157,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def scale_model_input( self, sample: torch.FloatTensor, @@ -280,34 +279,37 @@ def set_timesteps( self.sample = None - # for exp beta schedules, such as the one for `pipeline_shap_e.py` - # we need an index counter - self._index_counter = defaultdict(int) - self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property def state_in_first_order(self): return self.sample is None - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + indices = (schedule_timesteps == timestep).nonzero() # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) - if len(index_candidates) > 1: - step_index = index_candidates[1] - else: - step_index = index_candidates[0] + pos = 1 if len(indices) > 1 else 0 - self._step_index = step_index.item() + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t def _sigma_to_t(self, sigma, log_sigmas): @@ -388,10 +390,6 @@ def step( if self.step_index is None: self._init_step_index(timestep) - # advance index counter by 1 - timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep - self._index_counter[timestep_int] += 1 - if self.state_in_first_order: sigma = self.sigmas[self.step_index] sigma_interpol = self.sigmas_interpol[self.step_index + 1] @@ -453,7 +451,7 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.add_noise + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, original_samples: torch.FloatTensor, @@ -470,7 +468,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_lcm.py b/src/diffusers/schedulers/scheduling_lcm.py index a54f78423d73..1156c2634e31 100644 --- a/src/diffusers/schedulers/scheduling_lcm.py +++ b/src/diffusers/schedulers/scheduling_lcm.py @@ -250,29 +250,54 @@ def __init__( self.custom_timesteps = False self._step_index = None + self._begin_index = None - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + indices = (schedule_timesteps == timestep).nonzero() # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) - if len(index_candidates) > 1: - step_index = index_candidates[1] - else: - step_index = index_candidates[0] + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() - self._step_index = step_index.item() + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index @property def step_index(self): return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the @@ -462,6 +487,7 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.long) self._step_index = None + self._begin_index = None def get_scalings_for_boundary_condition_discrete(self, timestep): self.sigma_data = 0.5 # Default: 0.5 diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py index f5f52b06bd43..02f78014d1f7 100644 --- a/src/diffusers/schedulers/scheduling_lms_discrete.py +++ b/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -168,6 +168,7 @@ def __init__( self.is_scale_input_called = False self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property @@ -185,6 +186,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. 
+ """ + self._begin_index = begin_index + def scale_model_input( self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] ) -> torch.FloatTensor: @@ -280,27 +299,34 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = torch.from_numpy(sigmas).to(device=device) self.timesteps = torch.from_numpy(timesteps).to(device=device) self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.derivatives = [] - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + indices = (schedule_timesteps == timestep).nonzero() # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. for image-to-image) - if len(index_candidates) > 1: - step_index = index_candidates[1] - else: - step_index = index_candidates[0] + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() - self._step_index = step_index.item() + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index # copied from diffusers.schedulers.scheduling_euler_discrete._sigma_to_t def _sigma_to_t(self, sigma, log_sigmas): @@ -434,7 +460,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py index 13e3c76cf5b4..6a07cd082a47 100644 --- a/src/diffusers/schedulers/scheduling_sasolver.py +++ b/src/diffusers/schedulers/scheduling_sasolver.py @@ -212,6 +212,7 @@ def __init__( self.lower_order_nums = 0 self.last_sample = None self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property @@ -221,6 +222,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. 
+ """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). @@ -283,6 +302,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc # add an index counter for schedulers that allow duplicated timesteps self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample @@ -925,11 +945,12 @@ def stochastic_adams_moulton_update( x_t = x_t.to(x.dtype) return x_t - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + index_candidates = (schedule_timesteps == timestep).nonzero() if len(index_candidates) == 0: step_index = len(self.timesteps) - 1 @@ -942,7 +963,20 @@ def _init_step_index(self, timestep): else: step_index = index_candidates[0].item() - self._step_index = step_index + return step_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index def step( self, diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 1223213c69f3..e556093ee91b 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -198,6 +198,7 @@ def __init__( self.solver_p = solver_p self.last_sample = None self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property @@ -207,6 +208,24 @@ def step_index(self): """ return self._step_index + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. 
+ """ + self._begin_index = begin_index + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). @@ -269,6 +288,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic # add an index counter for schedulers that allow duplicated timesteps self._step_index = None + self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample @@ -698,11 +718,12 @@ def multistep_uni_c_bh_update( x_t = x_t.to(x.dtype) return x_t - def _init_step_index(self, timestep): - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps - index_candidates = (self.timesteps == timestep).nonzero() + index_candidates = (schedule_timesteps == timestep).nonzero() if len(index_candidates) == 0: step_index = len(self.timesteps) - 1 @@ -715,7 +736,20 @@ def _init_step_index(self, timestep): else: step_index = index_candidates[0].item() - self._step_index = step_index + return step_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index def step( self, @@ -830,16 +864,11 @@ def add_noise( schedule_timesteps = self.timesteps.to(original_samples.device) timesteps = timesteps.to(original_samples.device) - step_indices = [] - for timestep in timesteps: - index_candidates = (schedule_timesteps == timestep).nonzero() - if len(index_candidates) == 0: - step_index = len(schedule_timesteps) - 1 - elif len(index_candidates) > 1: - step_index = index_candidates[1].item() - else: - step_index = index_candidates[0].item() - step_indices.append(step_index) + # begin_index is None when the scheduler is used for training + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): From 9cc59ba0891c3ccbebfcbb6784f4dde2d5bd8556 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 2 Feb 2024 16:29:00 +0100 Subject: [PATCH 02/43] [Contributor Experience] Fix test collection on MPS (#6808) * Update testing_utils.py * Update testing_utils.py --- src/diffusers/utils/testing_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 86e31eb688cd..edbf6f31a833 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -854,6 +854,8 @@ def _is_torch_fp64_available(device): import torch + device = torch.device(device) + try: x = torch.zeros((2, 2), dtype=torch.float64).to(device) _ = torch.mul(x, x) From 15ed53d27227c215ebd3e36fab7b537f23b4b105 Mon 
Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Sat, 3 Feb 2024 05:16:32 +0100 Subject: [PATCH 03/43] Fixes LoRA SDXL training script with DDP + PEFT (#6816) Update train_dreambooth_lora_sdxl.py --- examples/dreambooth/train_dreambooth_lora_sdxl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index a995eb3043dc..2cc2ab79db95 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -1399,8 +1399,8 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): text_encoder_two.train() # set top parameter requires_grad = True for gradient checkpointing works - text_encoder_one.text_model.embeddings.requires_grad_(True) - text_encoder_two.text_model.embeddings.requires_grad_(True) + accelerator.unwrap_model(text_encoder_one).text_model.embeddings.requires_grad_(True) + accelerator.unwrap_model(text_encoder_two).text_model.embeddings.requires_grad_(True) for step, batch in enumerate(train_dataloader): with accelerator.accumulate(unet): From 02338c93178461f6dfbc9e82f352d03277812f82 Mon Sep 17 00:00:00 2001 From: Stephen Date: Sat, 3 Feb 2024 02:14:13 -0500 Subject: [PATCH 04/43] Change path to posix (testing_utils.py) (#6803) change path to pathlib as_posix Co-authored-by: Sayak Paul --- src/diffusers/utils/testing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index edbf6f31a833..02ecae22aafe 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -522,7 +522,7 @@ def load_hf_numpy(path) -> np.ndarray: base_url = "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main" if not path.startswith("http://") and not path.startswith("https://"): - path = os.path.join(base_url, urllib.parse.quote(path)) + path = Path(base_url, urllib.parse.quote(path)).as_posix() return load_numpy(path) From 65329aed985b9d342b34f0d649693a61909fbf4f Mon Sep 17 00:00:00 2001 From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com> Date: Sat, 3 Feb 2024 17:33:43 +0200 Subject: [PATCH 05/43] [advanced dreambooth lora sdxl script] new features + bug fixes (#6691) * add noise_offset param * micro conditioning - wip * image processing adjusted and moved to support micro conditioning * change time ids to be computed inside train loop * change time ids to be computed inside train loop * change time ids to be computed inside train loop * time ids shape fix * move token replacement of validation prompt to the same section of instance prompt and class prompt * add offset noise to sd15 advanced script * fix token loading during validation * fix token loading during validation in sdxl script * a little clean * style * a little clean * style * sdxl script - a little clean + minor path fix sd 1.5 script - change default resolution value * ad 1.5 script - minor path fix * fix missing comma in code example in model card * clean up commented lines * style * remove time ids computed outside training loop - no longer used now that we utilize micro-conditioning, as all time ids are now computed inside the training loop * style * [WIP] - added draft readme, building off of examples/dreambooth/README.md * readme * readme * readme * readme * readme * readme * readme * readme * removed --crops_coords_top_left from CLI args * style * fix 
missing shape bug due to missing RGB if statement

* add blog mention at the start of the reamde as well

* Update examples/advanced_diffusion_training/README.md

Co-authored-by: Sayak Paul

* change note to render nicely as well

---------

Co-authored-by: Sayak Paul
---
 .../advanced_diffusion_training/README.md    | 244 ++++++++++++++++++
 .../requirements.txt                         |   7 +
 .../train_dreambooth_lora_sd15_advanced.py   |  37 ++-
 .../train_dreambooth_lora_sdxl_advanced.py   | 153 +++++++----
 4 files changed, 386 insertions(+), 55 deletions(-)
 create mode 100644 examples/advanced_diffusion_training/README.md
 create mode 100644 examples/advanced_diffusion_training/requirements.txt

diff --git a/examples/advanced_diffusion_training/README.md b/examples/advanced_diffusion_training/README.md
new file mode 100644
index 000000000000..0a49284543d2
--- /dev/null
+++ b/examples/advanced_diffusion_training/README.md
@@ -0,0 +1,244 @@
+# Advanced diffusion training examples
+
+## Train Dreambooth LoRA with Stable Diffusion XL
+> [!TIP]
+> 💡 This example follows the techniques and recommended practices covered in the blog post: [LoRA training scripts of the world, unite!](https://huggingface.co/blog/sdxl_lora_advanced_script). Make sure to check it out before starting 🤗
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like Stable Diffusion given just a few (3-5) images of a subject.
+
+LoRA - Low-Rank Adaptation of Large Language Models - was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
+In a nutshell, LoRA allows you to adapt pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
+- Previous pretrained weights are kept frozen so that the model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114)
+- Rank-decomposition matrices have significantly fewer parameters than the original model, which means that trained LoRA weights are easily portable.
+- LoRA attention layers allow you to control the extent to which the model is adapted toward new training images via a `scale` parameter.
+[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in
+the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
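+
+To make the rank-decomposition idea concrete, here is a minimal, illustrative sketch of a LoRA-style linear layer in plain PyTorch. It is only meant to visualize the concept described above (class and variable names are made up for illustration); the scripts in this folder do not define such a layer themselves and instead rely on the PEFT library:
+
+```python
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Frozen base layer plus a trainable low-rank update: y = W x + scale * B A x."""
+
+    def __init__(self, base: nn.Linear, rank: int = 8, scale: float = 1.0):
+        super().__init__()
+        self.base = base
+        self.base.requires_grad_(False)  # pretrained weights stay frozen
+        self.lora_A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+        self.lora_B = nn.Parameter(torch.zeros(base.out_features, rank))  # zero init: no change at start
+        self.scale = scale
+
+    def forward(self, x):
+        # only lora_A and lora_B receive gradients during training
+        return self.base(x) + self.scale * (x @ self.lora_A.T @ self.lora_B.T)
+
+layer = LoRALinear(nn.Linear(768, 768), rank=8)
+print(layer(torch.randn(2, 768)).shape)  # torch.Size([2, 768])
+```
+
+Because only the two small matrices change during training, the exported LoRA weights stay small and portable.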
+ +The `train_dreambooth_lora_sdxl_advanced.py` script shows how to implement dreambooth-LoRA, combining the training process shown in `train_dreambooth_lora_sdxl.py`, with +advanced features and techniques, inspired and built upon contributions by [Nataniel Ruiz](https://twitter.com/natanielruizg): [Dreambooth](https://dreambooth.github.io), [Rinon Gal](https://twitter.com/RinonGal): [Textual Inversion](https://textual-inversion.github.io), [Ron Mokady](https://twitter.com/MokadyRon): [Pivotal Tuning](https://arxiv.org/abs/2106.05744), [Simo Ryu](https://twitter.com/cloneofsimo): [cog-sdxl](https://github.com/replicate/cog-sdxl), +[Kohya](https://twitter.com/kohya_tech/): [sd-scripts](https://github.com/kohya-ss/sd-scripts), [The Last Ben](https://twitter.com/__TheBen): [fast-stable-diffusion](https://github.com/TheLastBen/fast-stable-diffusion) ❤️ + +> [!NOTE] +> 💡If this is your first time training a Dreambooth LoRA, congrats!🥳 +> You might want to familiarize yourself more with the techniques: [Dreambooth blog](https://huggingface.co/blog/dreambooth), [Using LoRA for Efficient Stable Diffusion Fine-Tuning blog](https://huggingface.co/blog/lora) + +📚 Read more about the advanced features and best practices in this community derived blog post: [LoRA training scripts of the world, unite!](https://huggingface.co/blog/sdxl_lora_advanced_script) + + +## Running locally with PyTorch + +### Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +**Important** + +To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: +```bash +git clone https://github.com/huggingface/diffusers +cd diffusers +pip install -e . +``` + +Then cd in the `examples/advanced_diffusion_training` folder and run +```bash +pip install -r requirements.txt +``` + +And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with: + +```bash +accelerate config +``` + +Or for a default accelerate configuration without answering questions about your environment + +```bash +accelerate config default +``` + +Or if your environment doesn't support an interactive shell e.g. a notebook + +```python +from accelerate.utils import write_basic_config +write_basic_config() +``` + +When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups. +Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment. + +### Pivotal Tuning +**Training with text encoder(s)** + +Alongside the UNet, LoRA fine-tuning of the text encoders is also supported. In addition to the text encoder optimization +available with `train_dreambooth_lora_sdxl_advanced.py`, in the advanced script **pivotal tuning** is also supported. +[pivotal tuning](https://huggingface.co/blog/sdxl_lora_advanced_script#pivotal-tuning) combines Textual Inversion with regular diffusion fine-tuning - +we insert new tokens into the text encoders of the model, instead of reusing existing ones. +We then optimize the newly-inserted token embeddings to represent the new concept. + +To do so, just specify `--train_text_encoder_ti` while launching training (for regular text encoder optimizations, use `--train_text_encoder`). 
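+
+Roughly, this is what happens under the hood for each text encoder: new placeholder tokens are added to the tokenizer, the embedding matrix is resized, and only the newly inserted embedding rows are optimized. The sketch below illustrates the idea with SDXL's first text encoder only; it is a simplified stand-in, not the script's actual `TokenEmbeddingsHandler` logic, and the gradient mask shown here is just one possible way to restrict training to the new rows:
+
+```python
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer
+
+model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
+text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")
+
+# 1. insert new placeholder tokens and grow the embedding matrix accordingly
+new_tokens = ["<s0>", "<s1>"]
+tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})
+text_encoder.resize_token_embeddings(len(tokenizer))
+new_ids = tokenizer.convert_tokens_to_ids(new_tokens)
+
+# 2. freeze the encoder; keep only the embedding matrix trainable and mask
+#    gradients so that just the newly inserted rows get updated
+for param in text_encoder.parameters():
+    param.requires_grad_(False)
+embeddings = text_encoder.get_input_embeddings()
+embeddings.weight.requires_grad_(True)
+
+row_mask = torch.zeros(len(tokenizer), 1)
+row_mask[new_ids] = 1.0
+embeddings.weight.register_hook(lambda grad: grad * row_mask)
+```
+
+After training, these optimized rows are what gets saved to the `*_emb.safetensors` file and later loaded back with `load_textual_inversion` (see the inference section below).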
+
+Please keep the following points in mind:
+
+* SDXL has two text encoders. So, we fine-tune both using LoRA.
+* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memory.
+
+
+### 3D icon example
+
+Now let's get our dataset. For this example we will use some cool images of 3d rendered icons: https://huggingface.co/datasets/linoyts/3d_icon.
+
+Let's first download it locally:
+
+```python
+from huggingface_hub import snapshot_download
+
+local_dir = "./3d_icon"
+snapshot_download(
+    "LinoyTsaban/3d_icon",
+    local_dir=local_dir, repo_type="dataset",
+    ignore_patterns=".gitattributes",
+)
+```
+
+Let's review some of the advanced features we're going to be using for this example:
+- **custom captions**:
+To use custom captioning, first ensure that you have the `datasets` library installed; otherwise, you can install it with
+```bash
+pip install datasets
+```
+
+Now we'll simply specify the name of the dataset and caption column (in this case it's "prompt"):
+
+```
+--dataset_name=./3d_icon
+--caption_column=prompt
+```
+
+You can also load a dataset straight from the Hub by specifying its name in `dataset_name`.
+Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.
+
+- **optimizer**: for this example, we'll use [prodigy](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers) - an adaptive optimizer
+- **pivotal tuning**
+- **min SNR gamma**
+
+**Now, we can launch training:**
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export DATASET_NAME="./3d_icon"
+export OUTPUT_DIR="3d-icon-SDXL-LoRA"
+export VAE_PATH="madebyollin/sdxl-vae-fp16-fix"
+
+accelerate launch train_dreambooth_lora_sdxl_advanced.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --pretrained_vae_model_name_or_path=$VAE_PATH \
+  --dataset_name=$DATASET_NAME \
+  --instance_prompt="3d icon in the style of TOK" \
+  --validation_prompt="a TOK icon of an astronaut riding a horse, in the style of TOK" \
+  --output_dir=$OUTPUT_DIR \
+  --caption_column="prompt" \
+  --mixed_precision="bf16" \
+  --resolution=1024 \
+  --train_batch_size=3 \
+  --repeats=1 \
+  --report_to="wandb"\
+  --gradient_accumulation_steps=1 \
+  --gradient_checkpointing \
+  --learning_rate=1.0 \
+  --text_encoder_lr=1.0 \
+  --optimizer="prodigy"\
+  --train_text_encoder_ti\
+  --train_text_encoder_ti_frac=0.5\
+  --snr_gamma=5.0 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --rank=8 \
+  --max_train_steps=1000 \
+  --checkpointing_steps=2000 \
+  --seed="0" \
+  --push_to_hub
+```
+
+To better track our training experiments, we're using the following flags in the command above:
+
+* `report_to="wandb"` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
+* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+Our experiments were conducted on a single 40GB A100 GPU.
+
+
+### Inference
+
+Once training is done, we can perform inference like so:
+1.
starting with loading the unet lora weights +```python +import torch +from huggingface_hub import hf_hub_download, upload_file +from diffusers import DiffusionPipeline +from diffusers.models import AutoencoderKL +from safetensors.torch import load_file + +username = "linoyts" +repo_id = f"{username}/3d-icon-SDXL-LoRA" + +pipe = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + torch_dtype=torch.float16, + variant="fp16", +).to("cuda") + + +pipe.load_lora_weights(repo_id, weight_name="pytorch_lora_weights.safetensors") +``` +2. now we load the pivotal tuning embeddings + +```python +text_encoders = [pipe.text_encoder, pipe.text_encoder_2] +tokenizers = [pipe.tokenizer, pipe.tokenizer_2] + +embedding_path = hf_hub_download(repo_id=repo_id, filename="3d-icon-SDXL-LoRA_emb.safetensors", repo_type="model") + +state_dict = load_file(embedding_path) +# load embeddings of text_encoder 1 (CLIP ViT-L/14) +pipe.load_textual_inversion(state_dict["clip_l"], token=["", ""], text_encoder=pipe.text_encoder, tokenizer=pipe.tokenizer) +# load embeddings of text_encoder 2 (CLIP ViT-G/14) +pipe.load_textual_inversion(state_dict["clip_g"], token=["", ""], text_encoder=pipe.text_encoder_2, tokenizer=pipe.tokenizer_2) +``` + +3. let's generate images + +```python +instance_token = "" +prompt = f"a {instance_token} icon of an orange llama eating ramen, in the style of {instance_token}" + +image = pipe(prompt=prompt, num_inference_steps=25, cross_attention_kwargs={"scale": 1.0}).images[0] +image.save("llama.png") +``` + +### Comfy UI / AUTOMATIC1111 Inference +The new script fully supports textual inversion loading with Comfy UI and AUTOMATIC1111 formats! + +**AUTOMATIC1111 / SD.Next** \ +In AUTOMATIC1111/SD.Next we will load a LoRA and a textual embedding at the same time. +- *LoRA*: Besides the diffusers format, the script will also train a WebUI compatible LoRA. It is generated as `{your_lora_name}.safetensors`. You can then include it in your `models/Lora` directory. +- *Embedding*: the embedding is the same for diffusers and WebUI. You can download your `{lora_name}_emb.safetensors` file from a trained model, and include it in your `embeddings` directory. + +You can then run inference by prompting `a y2k_emb webpage about the movie Mean Girls `. You can use the `y2k_emb` token normally, including increasing its weight by doing `(y2k_emb:1.2)`. + +**ComfyUI** \ +In ComfyUI we will load a LoRA and a textual embedding at the same time. +- *LoRA*: Besides the diffusers format, the script will also train a ComfyUI compatible LoRA. It is generated as `{your_lora_name}.safetensors`. You can then include it in your `models/Lora` directory. Then you will load the LoRALoader node and hook that up with your model and CLIP. [Official guide for loading LoRAs](https://comfyanonymous.github.io/ComfyUI_examples/lora/) +- *Embedding*: the embedding is the same for diffusers and WebUI. You can download your `{lora_name}_emb.safetensors` file from a trained model, and include it in your `models/embeddings` directory and use it in your prompts like `embedding:y2k_emb`. [Official guide for loading embeddings](https://comfyanonymous.github.io/ComfyUI_examples/textual_inversion_embeddings/). +- +### Specifying a better VAE + +SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)). 
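+
+The same VAE can also be swapped in explicitly at inference time by passing it to the pipeline. A short sketch, reusing the example repository from the inference section above (replace the repository id with your own trained model):
+
+```python
+import torch
+from diffusers import AutoencoderKL, DiffusionPipeline
+
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    vae=vae,  # use the fp16-safe VAE instead of the one bundled with the base model
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda")
+
+pipe.load_lora_weights("linoyts/3d-icon-SDXL-LoRA", weight_name="pytorch_lora_weights.safetensors")
+image = pipe("a 3d icon of an astronaut riding a horse", num_inference_steps=25).images[0]
+image.save("astronaut_icon.png")
+```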
+ + +### Tips and Tricks +Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices) + +## Running on Colab Notebook +Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_advanced_example.ipynb). +to train using the advanced features (including pivotal tuning), and [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb) to train on a free colab, using some of the advanced features (excluding pivotal tuning) + diff --git a/examples/advanced_diffusion_training/requirements.txt b/examples/advanced_diffusion_training/requirements.txt new file mode 100644 index 000000000000..3f86855e1d1e --- /dev/null +++ b/examples/advanced_diffusion_training/requirements.txt @@ -0,0 +1,7 @@ +accelerate>=0.16.0 +torchvision +transformers>=4.25.1 +ftfy +tensorboard +Jinja2 +peft==0.7.0 \ No newline at end of file diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py index 385144b133a6..3f660c5a3f4f 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py @@ -119,10 +119,9 @@ def save_model_card( diffusers_imports_pivotal = """from huggingface_hub import hf_hub_download from safetensors.torch import load_file """ - diffusers_example_pivotal = f"""embedding_path = hf_hub_download(repo_id='{repo_id}', filename='{embeddings_filename}.safetensors' repo_type="model") + diffusers_example_pivotal = f"""embedding_path = hf_hub_download(repo_id='{repo_id}', filename='{embeddings_filename}.safetensors', repo_type="model") state_dict = load_file(embedding_path) pipeline.load_textual_inversion(state_dict["clip_l"], token=[{ti_keys}], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer) -pipeline.load_textual_inversion(state_dict["clip_g"], token=[{ti_keys}], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2) """ webui_example_pivotal = f"""- *Embeddings*: download **[`{embeddings_filename}.safetensors` here 💾](/{repo_id}/blob/main/{embeddings_filename}.safetensors)**. - Place it on it on your `embeddings` folder @@ -389,7 +388,7 @@ def parse_args(input_args=None): parser.add_argument( "--resolution", type=int, - default=1024, + default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" " resolution" @@ -645,6 +644,7 @@ def parse_args(input_args=None): parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") parser.add_argument( "--rank", type=int, @@ -745,10 +745,11 @@ def initialize_new_tokens(self, inserting_toks: List[str]): idx += 1 + # copied from train_dreambooth_lora_sdxl_advanced.py def save_embeddings(self, file_path: str): assert self.train_ids is not None, "Initialize new tokens before saving embeddings." 
tensors = {} - # text_encoder_0 - CLIP ViT-L/14, text_encoder_1 - CLIP ViT-G/14 + # text_encoder_0 - CLIP ViT-L/14, text_encoder_1 - CLIP ViT-G/14 - TODO - change for sd idx_to_text_encoder_name = {0: "clip_l", 1: "clip_g"} for idx, text_encoder in enumerate(self.text_encoders): assert text_encoder.text_model.embeddings.token_embedding.weight.data.shape[0] == len( @@ -1634,6 +1635,11 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # Sample noise that we'll add to the latents noise = torch.randn_like(model_input) + if args.noise_offset: + # https://www.crosslabs.org//blog/diffusion-with-offset-noise + noise += args.noise_offset * torch.randn( + (model_input.shape[0], model_input.shape[1], 1, 1), device=model_input.device + ) bsz = model_input.shape[0] # Sample a random timestep for each image timesteps = torch.randint( @@ -1788,6 +1794,7 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): pipeline = StableDiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, vae=vae, + tokenizer=tokenizer_one, text_encoder=accelerator.unwrap_model(text_encoder_one), unet=accelerator.unwrap_model(unet), revision=args.revision, @@ -1860,6 +1867,11 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): unet_lora_layers=unet_lora_layers, text_encoder_lora_layers=text_encoder_lora_layers, ) + + if args.train_text_encoder_ti: + embeddings_path = f"{args.output_dir}/{args.output_dir}_emb.safetensors" + embedding_handler.save_embeddings(embeddings_path) + images = [] if args.validation_prompt and args.num_validation_images > 0: # Final inference @@ -1895,6 +1907,18 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # load attention processors pipeline.load_lora_weights(args.output_dir) + # load new tokens + if args.train_text_encoder_ti: + state_dict = load_file(embeddings_path) + all_new_tokens = [] + for key, value in token_abstraction_dict.items(): + all_new_tokens.extend(value) + pipeline.load_textual_inversion( + state_dict["clip_l"], + token=all_new_tokens, + text_encoder=pipeline.text_encoder, + tokenizer=pipeline.tokenizer, + ) # run inference pipeline = pipeline.to(accelerator.device) generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None @@ -1917,11 +1941,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): } ) - if args.train_text_encoder_ti: - embedding_handler.save_embeddings( - f"{args.output_dir}/{args.output_dir}_emb.safetensors", - ) - # Conver to WebUI format lora_state_dict = load_file(f"{args.output_dir}/pytorch_lora_weights.safetensors") peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict) diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py index e35630e3e8af..6ae3d315f8ff 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py @@ -20,6 +20,7 @@ import logging import math import os +import random import re import shutil import warnings @@ -45,6 +46,7 @@ from safetensors.torch import load_file, save_file from torch.utils.data import Dataset from torchvision import transforms +from torchvision.transforms.functional import crop from tqdm.auto import tqdm from transformers import AutoTokenizer, PretrainedConfig @@ -121,7 +123,7 @@ def save_model_card( diffusers_imports_pivotal = """from huggingface_hub import 
hf_hub_download from safetensors.torch import load_file """ - diffusers_example_pivotal = f"""embedding_path = hf_hub_download(repo_id='{repo_id}', filename='{embeddings_filename}.safetensors' repo_type="model") + diffusers_example_pivotal = f"""embedding_path = hf_hub_download(repo_id='{repo_id}', filename='{embeddings_filename}.safetensors', repo_type="model") state_dict = load_file(embedding_path) pipeline.load_textual_inversion(state_dict["clip_l"], token=[{ti_keys}], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer) pipeline.load_textual_inversion(state_dict["clip_g"], token=[{ti_keys}], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2) @@ -397,18 +399,6 @@ def parse_args(input_args=None): " resolution" ), ) - parser.add_argument( - "--crops_coords_top_left_h", - type=int, - default=0, - help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."), - ) - parser.add_argument( - "--crops_coords_top_left_w", - type=int, - default=0, - help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."), - ) parser.add_argument( "--center_crop", default=False, @@ -418,6 +408,11 @@ def parse_args(input_args=None): " cropped. The images will be resized to the resolution first before cropping." ), ) + parser.add_argument( + "--random_flip", + action="store_true", + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_text_encoder", action="store_true", @@ -659,6 +654,7 @@ def parse_args(input_args=None): parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") parser.add_argument( "--rank", type=int, @@ -901,6 +897,41 @@ def __init__( self.instance_images = [] for img in instance_images: self.instance_images.extend(itertools.repeat(img, repeats)) + + # image processing to prepare for using SD-XL micro-conditioning + self.original_sizes = [] + self.crop_top_lefts = [] + self.pixel_values = [] + train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR) + train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size) + train_flip = transforms.RandomHorizontalFlip(p=1.0) + train_transforms = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + for image in self.instance_images: + image = exif_transpose(image) + if not image.mode == "RGB": + image = image.convert("RGB") + self.original_sizes.append((image.height, image.width)) + image = train_resize(image) + if args.random_flip and random.random() < 0.5: + # flip + image = train_flip(image) + if args.center_crop: + y1 = max(0, int(round((image.height - args.resolution) / 2.0))) + x1 = max(0, int(round((image.width - args.resolution) / 2.0))) + image = train_crop(image) + else: + y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution)) + image = crop(image, y1, x1, h, w) + crop_top_left = (y1, x1) + self.crop_top_lefts.append(crop_top_left) + image = train_transforms(image) + self.pixel_values.append(image) + self.num_instance_images = len(self.instance_images) self._length = self.num_instance_images @@ -930,12 +961,12 @@ def __len__(self): def __getitem__(self, index): example = {} - instance_image = self.instance_images[index % self.num_instance_images] - instance_image = 
exif_transpose(instance_image) - - if not instance_image.mode == "RGB": - instance_image = instance_image.convert("RGB") - example["instance_images"] = self.image_transforms(instance_image) + instance_image = self.pixel_values[index % self.num_instance_images] + original_size = self.original_sizes[index % self.num_instance_images] + crop_top_left = self.crop_top_lefts[index % self.num_instance_images] + example["instance_images"] = instance_image + example["original_size"] = original_size + example["crop_top_left"] = crop_top_left if self.custom_instance_prompts: caption = self.custom_instance_prompts[index % self.num_instance_images] @@ -966,6 +997,8 @@ def __getitem__(self, index): def collate_fn(examples, with_prior_preservation=False): pixel_values = [example["instance_images"] for example in examples] prompts = [example["instance_prompt"] for example in examples] + original_sizes = [example["original_size"] for example in examples] + crop_top_lefts = [example["crop_top_left"] for example in examples] # Concat class and instance examples for prior preservation. # We do this to avoid doing two forward passes. @@ -976,7 +1009,12 @@ def collate_fn(examples, with_prior_preservation=False): pixel_values = torch.stack(pixel_values) pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() - batch = {"pixel_values": pixel_values, "prompts": prompts} + batch = { + "pixel_values": pixel_values, + "prompts": prompts, + "original_sizes": original_sizes, + "crop_top_lefts": crop_top_lefts, + } return batch @@ -1198,7 +1236,9 @@ def main(args): args.instance_prompt = args.instance_prompt.replace(token_abs, "".join(token_replacement)) if args.with_prior_preservation: args.class_prompt = args.class_prompt.replace(token_abs, "".join(token_replacement)) - + if args.validation_prompt: + args.validation_prompt = args.validation_prompt.replace(token_abs, "".join(token_replacement)) + print("validation prompt:", args.validation_prompt) # initialize the new tokens for textual inversion embedding_handler = TokenEmbeddingsHandler( [text_encoder_one, text_encoder_two], [tokenizer_one, tokenizer_two] @@ -1539,11 +1579,11 @@ def load_model_hook(models, input_dir): # pooled text embeddings # time ids - def compute_time_ids(): + def compute_time_ids(crops_coords_top_left, original_size=None): # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids - original_size = (args.resolution, args.resolution) + if original_size is None: + original_size = (args.resolution, args.resolution) target_size = (args.resolution, args.resolution) - crops_coords_top_left = (args.crops_coords_top_left_h, args.crops_coords_top_left_w) add_time_ids = list(original_size + crops_coords_top_left + target_size) add_time_ids = torch.tensor([add_time_ids]) add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype) @@ -1560,9 +1600,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device) return prompt_embeds, pooled_prompt_embeds - # Handle instance prompt. - instance_time_ids = compute_time_ids() - # If no type of tuning is done on the text_encoder and custom instance prompts are NOT # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid # the redundant encoding. @@ -1573,7 +1610,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # Handle class prompt for prior-preservation. 
if args.with_prior_preservation: - class_time_ids = compute_time_ids() if freeze_text_encoder: class_prompt_hidden_states, class_pooled_prompt_embeds = compute_text_embeddings( args.class_prompt, text_encoders, tokenizers @@ -1588,9 +1624,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images), # pack the statically computed variables appropriately here. This is so that we don't # have to pass them to the dataloader. - add_time_ids = instance_time_ids - if args.with_prior_preservation: - add_time_ids = torch.cat([add_time_ids, class_time_ids], dim=0) # if --train_text_encoder_ti we need add_special_tokens to be True fo textual inversion add_special_tokens = True if args.train_text_encoder_ti else False @@ -1613,12 +1646,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0) tokens_two = torch.cat([tokens_two, class_tokens_two], dim=0) - if args.train_text_encoder_ti and args.validation_prompt: - # replace instances of --token_abstraction in validation prompt with the new tokens: "" etc. - for token_abs, token_replacement in train_dataset.token_abstraction_dict.items(): - args.validation_prompt = args.validation_prompt.replace(token_abs, "".join(token_replacement)) - print("validation prompt:", args.validation_prompt) - if args.cache_latents: latents_cache = [] for batch in tqdm(train_dataloader, desc="Caching latents"): @@ -1778,6 +1805,12 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # Sample noise that we'll add to the latents noise = torch.randn_like(model_input) + if args.noise_offset: + # https://www.crosslabs.org//blog/diffusion-with-offset-noise + noise += args.noise_offset * torch.randn( + (model_input.shape[0], model_input.shape[1], 1, 1), device=model_input.device + ) + bsz = model_input.shape[0] # Sample a random timestep for each image timesteps = torch.randint( @@ -1789,19 +1822,26 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # (this is the forward diffusion process) noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps) + # time ids + add_time_ids = torch.cat( + [ + compute_time_ids(original_size=s, crops_coords_top_left=c) + for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"]) + ] + ) + # Calculate the elements to repeat depending on the use of prior-preservation and custom captions. 
if not train_dataset.custom_instance_prompts: elems_to_repeat_text_embeds = bsz // 2 if args.with_prior_preservation else bsz - elems_to_repeat_time_ids = bsz // 2 if args.with_prior_preservation else bsz else: elems_to_repeat_text_embeds = 1 - elems_to_repeat_time_ids = bsz // 2 if args.with_prior_preservation else bsz # Predict the noise residual if freeze_text_encoder: unet_added_conditions = { - "time_ids": add_time_ids.repeat(elems_to_repeat_time_ids, 1), + "time_ids": add_time_ids, + # "time_ids": add_time_ids.repeat(elems_to_repeat_time_ids, 1), "text_embeds": unet_add_text_embeds.repeat(elems_to_repeat_text_embeds, 1), } prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1) @@ -1812,7 +1852,7 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): added_cond_kwargs=unet_added_conditions, ).sample else: - unet_added_conditions = {"time_ids": add_time_ids.repeat(elems_to_repeat_time_ids, 1)} + unet_added_conditions = {"time_ids": add_time_ids} prompt_embeds, pooled_prompt_embeds = encode_prompt( text_encoders=[text_encoder_one, text_encoder_two], tokenizers=None, @@ -1954,6 +1994,8 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): pipeline = StableDiffusionXLPipeline.from_pretrained( args.pretrained_model_name_or_path, vae=vae, + tokenizer=tokenizer_one, + tokenizer_2=tokenizer_two, text_encoder=accelerator.unwrap_model(text_encoder_one), text_encoder_2=accelerator.unwrap_model(text_encoder_two), unet=accelerator.unwrap_model(unet), @@ -2033,6 +2075,11 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): text_encoder_lora_layers=text_encoder_lora_layers, text_encoder_2_lora_layers=text_encoder_2_lora_layers, ) + + if args.train_text_encoder_ti: + embeddings_path = f"{args.output_dir}/{args.output_dir}_emb.safetensors" + embedding_handler.save_embeddings(embeddings_path) + images = [] if args.validation_prompt and args.num_validation_images > 0: # Final inference @@ -2068,6 +2115,25 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # load attention processors pipeline.load_lora_weights(args.output_dir) + # load new tokens + if args.train_text_encoder_ti: + state_dict = load_file(embeddings_path) + all_new_tokens = [] + for key, value in token_abstraction_dict.items(): + all_new_tokens.extend(value) + pipeline.load_textual_inversion( + state_dict["clip_l"], + token=all_new_tokens, + text_encoder=pipeline.text_encoder, + tokenizer=pipeline.tokenizer, + ) + pipeline.load_textual_inversion( + state_dict["clip_g"], + token=all_new_tokens, + text_encoder=pipeline.text_encoder_2, + tokenizer=pipeline.tokenizer_2, + ) + # run inference pipeline = pipeline.to(accelerator.device) generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None @@ -2090,11 +2156,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): } ) - if args.train_text_encoder_ti: - embedding_handler.save_embeddings( - f"{args.output_dir}/{args.output_dir}_emb.safetensors", - ) - # Conver to WebUI format lora_state_dict = load_file(f"{args.output_dir}/pytorch_lora_weights.safetensors") peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict) From 13001ee315171c4f9e2fec4ed7862e19fa453a5e Mon Sep 17 00:00:00 2001 From: Fabio Rigano Date: Sat, 3 Feb 2024 19:56:55 +0100 Subject: [PATCH 06/43] Bugfix in IPAdapterFaceID (#6835) --- examples/community/ip_adapter_face_id.py | 81 +++++++++++------------- 1 file changed, 37 insertions(+), 44 deletions(-) diff --git 
a/examples/community/ip_adapter_face_id.py b/examples/community/ip_adapter_face_id.py index d9325742cf49..531efabcd52c 100644 --- a/examples/community/ip_adapter_face_id.py +++ b/examples/community/ip_adapter_face_id.py @@ -104,6 +104,22 @@ def __call__( ): residual = hidden_states + # separate ip_hidden_states from encoder_hidden_states + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, tuple): + encoder_hidden_states, ip_hidden_states = encoder_hidden_states + else: + deprecation_message = ( + "You have passed a tensor as `encoder_hidden_states`.This is deprecated and will be removed in a future release." + " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to supress this warning." + ) + deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False) + end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0] + encoder_hidden_states, ip_hidden_states = ( + encoder_hidden_states[:, :end_pos, :], + [encoder_hidden_states[:, end_pos:, :]], + ) + if attn.spatial_norm is not None: hidden_states = attn.spatial_norm(hidden_states, temb) @@ -125,15 +141,8 @@ def __call__( if encoder_hidden_states is None: encoder_hidden_states = hidden_states - else: - # get encoder_hidden_states, ip_hidden_states - end_pos = encoder_hidden_states.shape[1] - self.num_tokens - encoder_hidden_states, ip_hidden_states = ( - encoder_hidden_states[:, :end_pos, :], - encoder_hidden_states[:, end_pos:, :], - ) - if attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states) @@ -233,6 +242,22 @@ def __call__( ): residual = hidden_states + # separate ip_hidden_states from encoder_hidden_states + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, tuple): + encoder_hidden_states, ip_hidden_states = encoder_hidden_states + else: + deprecation_message = ( + "You have passed a tensor as `encoder_hidden_states`.This is deprecated and will be removed in a future release." + " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to supress this warning." 
+ ) + deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False) + end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0] + encoder_hidden_states, ip_hidden_states = ( + encoder_hidden_states[:, :end_pos, :], + [encoder_hidden_states[:, end_pos:, :]], + ) + if attn.spatial_norm is not None: hidden_states = attn.spatial_norm(hidden_states, temb) @@ -259,15 +284,8 @@ def __call__( if encoder_hidden_states is None: encoder_hidden_states = hidden_states - else: - # get encoder_hidden_states, ip_hidden_states - end_pos = encoder_hidden_states.shape[1] - self.num_tokens - encoder_hidden_states, ip_hidden_states = ( - encoder_hidden_states[:, :end_pos, :], - encoder_hidden_states[:, end_pos:, :], - ) - if attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states) @@ -951,30 +969,6 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds - def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): - dtype = next(self.image_encoder.parameters()).dtype - - if not isinstance(image, torch.Tensor): - image = self.feature_extractor(image, return_tensors="pt").pixel_values - - image = image.to(device=device, dtype=dtype) - if output_hidden_states: - image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] - image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) - uncond_image_enc_hidden_states = self.image_encoder( - torch.zeros_like(image), output_hidden_states=True - ).hidden_states[-2] - uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( - num_images_per_prompt, dim=0 - ) - return image_enc_hidden_states, uncond_image_enc_hidden_states - else: - image_embeds = self.image_encoder(image).image_embeds - image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - uncond_image_embeds = torch.zeros_like(image_embeds) - - return image_embeds, uncond_image_embeds - def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: has_nsfw_concept = None @@ -1302,7 +1296,6 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. image_embeds (`torch.FloatTensor`, *optional*): Pre-generated image embeddings. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -1411,7 +1404,7 @@ def __call__( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) if image_embeds is not None: - image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0).to( + image_embeds = torch.stack([image_embeds] * num_images_per_prompt, dim=0).to( device=device, dtype=prompt_embeds.dtype ) negative_image_embeds = torch.zeros_like(image_embeds) From fbdf26bac57502cbe4372d2c41b3a581190590e2 Mon Sep 17 00:00:00 2001 From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com> Date: Sun, 4 Feb 2024 16:00:09 +0200 Subject: [PATCH 07/43] [dreambooth lora sdxl] add sdxl micro conditioning (#6795) * add micro conditioning * remove redundant lines * style * fix missing 's' * fix missing shape bug due to missing RGB if statement * remove redundant if, change arg order --------- Co-authored-by: Sayak Paul --- .../dreambooth/train_dreambooth_lora_sdxl.py | 100 ++++++++++++------ 1 file changed, 67 insertions(+), 33 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index 2cc2ab79db95..aa09bf9a0ebf 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -19,6 +19,7 @@ import logging import math import os +import random import shutil import warnings from pathlib import Path @@ -40,6 +41,7 @@ from PIL.ImageOps import exif_transpose from torch.utils.data import Dataset from torchvision import transforms +from torchvision.transforms.functional import crop from tqdm.auto import tqdm from transformers import AutoTokenizer, PretrainedConfig @@ -304,18 +306,6 @@ def parse_args(input_args=None): " resolution" ), ) - parser.add_argument( - "--crops_coords_top_left_h", - type=int, - default=0, - help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."), - ) - parser.add_argument( - "--crops_coords_top_left_w", - type=int, - default=0, - help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."), - ) parser.add_argument( "--center_crop", default=False, @@ -325,6 +315,11 @@ def parse_args(input_args=None): " cropped. The images will be resized to the resolution first before cropping." 
), ) + parser.add_argument( + "--random_flip", + action="store_true", + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_text_encoder", action="store_true", @@ -669,6 +664,41 @@ def __init__( self.instance_images = [] for img in instance_images: self.instance_images.extend(itertools.repeat(img, repeats)) + + # image processing to prepare for using SD-XL micro-conditioning + self.original_sizes = [] + self.crop_top_lefts = [] + self.pixel_values = [] + train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR) + train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size) + train_flip = transforms.RandomHorizontalFlip(p=1.0) + train_transforms = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + for image in self.instance_images: + image = exif_transpose(image) + if not image.mode == "RGB": + image = image.convert("RGB") + self.original_sizes.append((image.height, image.width)) + image = train_resize(image) + if args.random_flip and random.random() < 0.5: + # flip + image = train_flip(image) + if args.center_crop: + y1 = max(0, int(round((image.height - args.resolution) / 2.0))) + x1 = max(0, int(round((image.width - args.resolution) / 2.0))) + image = train_crop(image) + else: + y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution)) + image = crop(image, y1, x1, h, w) + crop_top_left = (y1, x1) + self.crop_top_lefts.append(crop_top_left) + image = train_transforms(image) + self.pixel_values.append(image) + self.num_instance_images = len(self.instance_images) self._length = self.num_instance_images @@ -698,12 +728,12 @@ def __len__(self): def __getitem__(self, index): example = {} - instance_image = self.instance_images[index % self.num_instance_images] - instance_image = exif_transpose(instance_image) - - if not instance_image.mode == "RGB": - instance_image = instance_image.convert("RGB") - example["instance_images"] = self.image_transforms(instance_image) + instance_image = self.pixel_values[index % self.num_instance_images] + original_size = self.original_sizes[index % self.num_instance_images] + crop_top_left = self.crop_top_lefts[index % self.num_instance_images] + example["instance_images"] = instance_image + example["original_size"] = original_size + example["crop_top_left"] = crop_top_left if self.custom_instance_prompts: caption = self.custom_instance_prompts[index % self.num_instance_images] @@ -730,6 +760,8 @@ def __getitem__(self, index): def collate_fn(examples, with_prior_preservation=False): pixel_values = [example["instance_images"] for example in examples] prompts = [example["instance_prompt"] for example in examples] + original_sizes = [example["original_size"] for example in examples] + crop_top_lefts = [example["crop_top_left"] for example in examples] # Concat class and instance examples for prior preservation. # We do this to avoid doing two forward passes. 
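# Illustrative sketch (not part of the diff; example sizes are hypothetical). The per-image
# (original_size, crop_top_left) values collected above feed SDXL's additive time embedding:
# each image contributes a 6-tuple (orig_h, orig_w, crop_top, crop_left, target_h, target_w),
# mirroring the compute_time_ids helper further down in this patch.
import torch

def sketch_time_ids(original_size, crop_top_left, target_size=(1024, 1024)):
    # concatenate the three (h, w)-style pairs into a single conditioning vector
    add_time_ids = list(original_size + crop_top_left + target_size)
    return torch.tensor([add_time_ids])

# sketch_time_ids((768, 512), (0, 128)) -> tensor([[768, 512, 0, 128, 1024, 1024]])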
@@ -740,7 +772,12 @@ def collate_fn(examples, with_prior_preservation=False): pixel_values = torch.stack(pixel_values) pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() - batch = {"pixel_values": pixel_values, "prompts": prompts} + batch = { + "pixel_values": pixel_values, + "prompts": prompts, + "original_sizes": original_sizes, + "crop_top_lefts": crop_top_lefts, + } return batch @@ -1233,11 +1270,9 @@ def load_model_hook(models, input_dir): # pooled text embeddings # time ids - def compute_time_ids(): + def compute_time_ids(original_size, crops_coords_top_left): # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids - original_size = (args.resolution, args.resolution) target_size = (args.resolution, args.resolution) - crops_coords_top_left = (args.crops_coords_top_left_h, args.crops_coords_top_left_w) add_time_ids = list(original_size + crops_coords_top_left + target_size) add_time_ids = torch.tensor([add_time_ids]) add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype) @@ -1254,9 +1289,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device) return prompt_embeds, pooled_prompt_embeds - # Handle instance prompt. - instance_time_ids = compute_time_ids() - # If no type of tuning is done on the text_encoder and custom instance prompts are NOT # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid # the redundant encoding. @@ -1267,7 +1299,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # Handle class prompt for prior-preservation. if args.with_prior_preservation: - class_time_ids = compute_time_ids() if not args.train_text_encoder: class_prompt_hidden_states, class_pooled_prompt_embeds = compute_text_embeddings( args.class_prompt, text_encoders, tokenizers @@ -1282,9 +1313,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images), # pack the statically computed variables appropriately here. This is so that we don't # have to pass them to the dataloader. - add_time_ids = instance_time_ids - if args.with_prior_preservation: - add_time_ids = torch.cat([add_time_ids, class_time_ids], dim=0) if not train_dataset.custom_instance_prompts: if not args.train_text_encoder: @@ -1436,18 +1464,24 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # (this is the forward diffusion process) noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps) + # time ids + add_time_ids = torch.cat( + [ + compute_time_ids(original_size=s, crops_coords_top_left=c) + for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"]) + ] + ) + # Calculate the elements to repeat depending on the use of prior-preservation and custom captions. 
if not train_dataset.custom_instance_prompts: elems_to_repeat_text_embeds = bsz // 2 if args.with_prior_preservation else bsz - elems_to_repeat_time_ids = bsz // 2 if args.with_prior_preservation else bsz else: elems_to_repeat_text_embeds = 1 - elems_to_repeat_time_ids = bsz // 2 if args.with_prior_preservation else bsz # Predict the noise residual if not args.train_text_encoder: unet_added_conditions = { - "time_ids": add_time_ids.repeat(elems_to_repeat_time_ids, 1), + "time_ids": add_time_ids, "text_embeds": unet_add_text_embeds.repeat(elems_to_repeat_text_embeds, 1), } prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1) @@ -1459,7 +1493,7 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): return_dict=False, )[0] else: - unet_added_conditions = {"time_ids": add_time_ids.repeat(elems_to_repeat_time_ids, 1)} + unet_added_conditions = {"time_ids": add_time_ids} prompt_embeds, pooled_prompt_embeds = encode_prompt( text_encoders=[text_encoder_one, text_encoder_two], tokenizers=None, From a5fc62f81957c739b1d4a8fd99bf551a2949dc3c Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Sun, 4 Feb 2024 11:16:44 -1000 Subject: [PATCH 08/43] add `self.use_ada_layer_norm_*` params back to `BasicTransformerBlock` (#6841) fix sd reference community ppeline Co-authored-by: yiyixuxu --- examples/community/stable_diffusion_reference.py | 6 ++++-- examples/community/stable_diffusion_xl_reference.py | 6 ++++-- src/diffusers/models/attention.py | 6 ++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py index 88a7febae650..924548b35ca3 100644 --- a/examples/community/stable_diffusion_reference.py +++ b/examples/community/stable_diffusion_reference.py @@ -538,7 +538,7 @@ def hack_CrossAttnDownBlock2D_forward( return hidden_states, output_states - def hacked_DownBlock2D_forward(self, hidden_states, temb=None): + def hacked_DownBlock2D_forward(self, hidden_states, temb=None, **kwargs): eps = 1e-6 output_states = () @@ -634,7 +634,9 @@ def hacked_CrossAttnUpBlock2D_forward( return hidden_states - def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): + def hacked_UpBlock2D_forward( + self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, **kwargs + ): eps = 1e-6 for i, resnet in enumerate(self.resnets): # pop res hidden states diff --git a/examples/community/stable_diffusion_xl_reference.py b/examples/community/stable_diffusion_xl_reference.py index fbfb6bdd6160..4c7efa4b5f7a 100644 --- a/examples/community/stable_diffusion_xl_reference.py +++ b/examples/community/stable_diffusion_xl_reference.py @@ -507,7 +507,7 @@ def hack_CrossAttnDownBlock2D_forward( return hidden_states, output_states - def hacked_DownBlock2D_forward(self, hidden_states, temb=None): + def hacked_DownBlock2D_forward(self, hidden_states, temb=None, **kwargs): eps = 1e-6 output_states = () @@ -603,7 +603,9 @@ def hacked_CrossAttnUpBlock2D_forward( return hidden_states - def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): + def hacked_UpBlock2D_forward( + self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, **kwargs + ): eps = 1e-6 for i, resnet in enumerate(self.resnets): # pop res hidden states diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index afb022c8d612..d4d611250ad0 100644 --- 
a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -158,6 +158,12 @@ def __init__( super().__init__() self.only_cross_attention = only_cross_attention + self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" + self.use_ada_layer_norm_single = norm_type == "ada_norm_single" + self.use_layer_norm = norm_type == "layer_norm" + self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous" + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: raise ValueError( f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" From f09ca909c819dbc58a0010a4ad909bff7471a59a Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 5 Feb 2024 07:24:38 +0530 Subject: [PATCH 09/43] Multiple small fixes to Video Pipeline docs (#6805) * update * update * update * Update src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py Co-authored-by: YiYi Xu * update * update --------- Co-authored-by: YiYi Xu --- docs/source/en/api/pipelines/i2vgenxl.md | 12 ++++++------ docs/source/en/api/pipelines/pia.md | 6 +++--- docs/source/en/api/pipelines/text_to_video.md | 10 +++++----- .../pipelines/animatediff/pipeline_output.py | 13 +++++++------ .../pipelines/i2vgen_xl/pipeline_i2vgen_xl.py | 14 ++++++++------ src/diffusers/pipelines/pia/pipeline_pia.py | 4 ++-- .../text_to_video_synthesis/pipeline_output.py | 14 ++++++++------ 7 files changed, 39 insertions(+), 34 deletions(-) diff --git a/docs/source/en/api/pipelines/i2vgenxl.md b/docs/source/en/api/pipelines/i2vgenxl.md index e5c8b50f0ad8..1d7eb5db16db 100644 --- a/docs/source/en/api/pipelines/i2vgenxl.md +++ b/docs/source/en/api/pipelines/i2vgenxl.md @@ -18,11 +18,11 @@ The abstract from the paper is: *Video synthesis has recently made remarkable strides benefiting from the rapid development of diffusion models. However, it still encounters challenges in terms of semantic accuracy, clarity and spatio-temporal continuity. They primarily arise from the scarcity of well-aligned text-video data and the complex inherent structure of videos, making it difficult for the model to simultaneously ensure semantic and qualitative excellence. In this report, we propose a cascaded I2VGen-XL approach that enhances model performance by decoupling these two factors and ensures the alignment of the input data by utilizing static images as a form of crucial guidance. I2VGen-XL consists of two stages: i) the base stage guarantees coherent semantics and preserves content from input images by using two hierarchical encoders, and ii) the refinement stage enhances the video's details by incorporating an additional brief text and improves the resolution to 1280×720. To improve the diversity, we collect around 35 million single-shot text-video pairs and 6 billion text-image pairs to optimize the model. By this means, I2VGen-XL can simultaneously enhance the semantic accuracy, continuity of details and clarity of generated videos. Through extensive experiments, we have investigated the underlying principles of I2VGen-XL and compared it with current top methods, which can demonstrate its effectiveness on diverse data. The source code and models will be publicly available at [this https URL](https://i2vgen-xl.github.io/).* -The original codebase can be found [here](https://github.com/ali-vilab/i2vgen-xl/). 
The model checkpoints can be found [here](https://huggingface.co/ali-vilab/). +The original codebase can be found [here](https://github.com/ali-vilab/i2vgen-xl/). The model checkpoints can be found [here](https://huggingface.co/ali-vilab/). -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section [here](../../using-diffusers/svd#reduce-memory-usage). +Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section [here](../../using-diffusers/svd#reduce-memory-usage). @@ -31,7 +31,7 @@ Sample output with I2VGenXL:
[sample-output table cells: the surrounding HTML markup was lost in extraction; these hunks replace the placeholder caption "masterpiece, bestquality, sunset." with the actual prompts "library." and "cat in a field." in the video pipeline doc pages]
cat in a field>> import torch >>> from diffusers import I2VGenXLPipeline + >>> from diffusers.utils import export_to_gif, load_image >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") >>> pipeline.enable_model_cpu_offload() @@ -95,15 +96,16 @@ def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: @dataclass class I2VGenXLPipelineOutput(BaseOutput): r""" - Output class for image-to-video pipeline. + Output class for image-to-video pipeline. - Args: - frames (`List[np.ndarray]` or `torch.FloatTensor`) - List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as - a `torch` tensor. The length of the list denotes the video length (the number of frames). + Args: + frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape + `(batch_size, num_frames, channels, height, width)` """ - frames: Union[List[np.ndarray], torch.FloatTensor] + frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] class I2VGenXLPipeline(DiffusionPipeline): diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index fda56088b916..802081b52182 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -200,13 +200,13 @@ class PIAPipelineOutput(BaseOutput): Output class for PIAPipeline. Args: - frames (`torch.Tensor`, `np.ndarray`, or List[PIL.Image.Image]): + frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, NumPy array of shape `(batch_size, num_frames, channels, height, width, Torch tensor of shape `(batch_size, num_frames, channels, height, width)`. """ - frames: Union[torch.Tensor, np.ndarray, PIL.Image.Image] + frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py index 411515809e6f..c155386cf173 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py @@ -2,6 +2,7 @@ from typing import List, Union import numpy as np +import PIL import torch from ...utils import ( @@ -12,12 +13,13 @@ @dataclass class TextToVideoSDPipelineOutput(BaseOutput): """ - Output class for text-to-video pipelines. + Output class for text-to-video pipelines. - Args: - frames (`List[np.ndarray]` or `torch.FloatTensor`) - List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as - a `torch` tensor. The length of the list denotes the video length (the number of frames). 
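# Illustrative sketch, kept as comments (not part of the diff; assumes the pipeline/image
# setup from the doc example earlier in this patch, and "i2v.gif" is an arbitrary name).
# With the frames docstring rewritten in this hunk, output_type="pil" yields a nested list:
# one inner list of PIL frames per prompt, which can be consumed with the export_to_gif
# helper imported in that doc example:
# frames = pipeline(prompt=prompt, image=image, generator=generator).frames[0]
# export_to_gif(frames, "i2v.gif")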
+ Args: + frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape + `(batch_size, num_frames, channels, height, width)` """ - frames: Union[List[np.ndarray], torch.FloatTensor] + frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] From 64909f17b79dda63fdd8fcbe588dd8d5fc736bac Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Sun, 4 Feb 2024 15:56:46 -1000 Subject: [PATCH 10/43] update IP-adapter code in UNetMotionModel (#6828) fix Co-authored-by: yiyixuxu --- src/diffusers/models/unets/unet_motion_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index aa53d1dba93e..9cd3db126a14 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -792,6 +792,7 @@ def forward( emb = self.time_embedding(t_emb, timestep_cond) emb = emb.repeat_interleave(repeats=num_frames, dim=0) + encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0) if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj": if "image_embeds" not in added_cond_kwargs: @@ -799,10 +800,9 @@ def forward( f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" ) image_embeds = added_cond_kwargs.get("image_embeds") - image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype) - encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) - - encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0) + image_embeds = self.encoder_hid_proj(image_embeds) + image_embeds = [image_embed.repeat_interleave(repeats=num_frames, dim=0) for image_embed in image_embeds] + encoder_hidden_states = (encoder_hidden_states, image_embeds) # 2. 
pre-process sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:]) From c6f8c310c36a7dd355f00c5d63ffa91e42198855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E5=92=A9Goat?= <38155995+BaaBaaGoat@users.noreply.github.com> Date: Mon, 5 Feb 2024 10:34:01 +0800 Subject: [PATCH 11/43] Fix forward pass in UNetMotionModel when gradient checkpoint is enabled (#6744) fix #6742 Co-authored-by: Dhruv Nair --- src/diffusers/models/unets/unet_3d_blocks.py | 31 +++++++------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/src/diffusers/models/unets/unet_3d_blocks.py b/src/diffusers/models/unets/unet_3d_blocks.py index a1d9e848c230..1495ae54ee82 100644 --- a/src/diffusers/models/unets/unet_3d_blocks.py +++ b/src/diffusers/models/unets/unet_3d_blocks.py @@ -1031,16 +1031,10 @@ def custom_forward(*inputs): hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(resnet), hidden_states, temb, scale ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(motion_module), - hidden_states.requires_grad_(), - temb, - num_frames, - ) else: hidden_states = resnet(hidden_states, temb, scale=scale) - hidden_states = motion_module(hidden_states, num_frames=num_frames)[0] + hidden_states = motion_module(hidden_states, num_frames=num_frames)[0] output_states = output_states + (hidden_states,) @@ -1221,10 +1215,10 @@ def custom_forward(*inputs): encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] - hidden_states = motion_module( - hidden_states, - num_frames=num_frames, - )[0] + hidden_states = motion_module( + hidden_states, + num_frames=num_frames, + )[0] # apply additional residuals to the output of the last pair of resnet and attention blocks if i == len(blocks) - 1 and additional_residuals is not None: @@ -1425,10 +1419,10 @@ def custom_forward(*inputs): encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] - hidden_states = motion_module( - hidden_states, - num_frames=num_frames, - )[0] + hidden_states = motion_module( + hidden_states, + num_frames=num_frames, + )[0] if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -1563,15 +1557,10 @@ def custom_forward(*inputs): hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(resnet), hidden_states, temb ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), - hidden_states, - temb, - ) else: hidden_states = resnet(hidden_states, temb, scale=scale) - hidden_states = motion_module(hidden_states, num_frames=num_frames)[0] + hidden_states = motion_module(hidden_states, num_frames=num_frames)[0] if self.upsamplers is not None: for upsampler in self.upsamplers: From fdf55b1f1c25c7e2fedcdd0dc9c6f3db005453bb Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 5 Feb 2024 08:57:18 +0530 Subject: [PATCH 12/43] Fix posix path issue in testing utils (#6849) update --- src/diffusers/utils/testing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 02ecae22aafe..edbf6f31a833 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -522,7 +522,7 @@ def load_hf_numpy(path) -> np.ndarray: base_url = "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main" if not path.startswith("http://") and not path.startswith("https://"): - path = Path(base_url, urllib.parse.quote(path)).as_posix() + path = os.path.join(base_url, 
urllib.parse.quote(path)) return load_numpy(path) From bb99623d0947c8cc8701791579f28d124191c18c Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 5 Feb 2024 16:22:59 +0530 Subject: [PATCH 13/43] Update IP Adapter tests to use cosine similarity distance (#6806) * update * update --- .../test_ip_adapter_stable_diffusion.py | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index 710dea3c2da7..84aa41f54bb6 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -35,6 +35,7 @@ from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, + numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device, @@ -119,7 +120,8 @@ def test_text_to_image(self): expected_slice = np.array([0.80810547, 0.88183594, 0.9296875, 0.9189453, 0.9848633, 1.0, 0.97021484, 1.0, 1.0]) - assert np.allclose(image_slice, expected_slice, atol=1e-3) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.bin") @@ -131,7 +133,8 @@ def test_text_to_image(self): [0.30444336, 0.26513672, 0.22436523, 0.2758789, 0.25585938, 0.20751953, 0.25390625, 0.24633789, 0.21923828] ) - assert np.allclose(image_slice, expected_slice, atol=1e-3) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 def test_image_to_image(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") @@ -149,7 +152,8 @@ def test_image_to_image(self): [0.22167969, 0.21875, 0.21728516, 0.22607422, 0.21948242, 0.23925781, 0.22387695, 0.25268555, 0.2722168] ) - assert np.allclose(image_slice, expected_slice, atol=1e-3) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.bin") @@ -161,7 +165,8 @@ def test_image_to_image(self): [0.35913086, 0.265625, 0.26367188, 0.24658203, 0.19750977, 0.39990234, 0.15258789, 0.20336914, 0.5517578] ) - assert np.allclose(image_slice, expected_slice, atol=1e-3) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 def test_inpainting(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") @@ -179,7 +184,8 @@ def test_inpainting(self): [0.27148438, 0.24047852, 0.22167969, 0.23217773, 0.21118164, 0.21142578, 0.21875, 0.20751953, 0.20019531] ) - assert np.allclose(image_slice, expected_slice, atol=1e-3) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.bin") @@ -187,11 +193,8 @@ def test_inpainting(self): images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array( - [0.27294922, 0.24023438, 0.21948242, 0.23242188, 0.20825195, 0.2055664, 0.21679688, 0.20336914, 0.19360352] - ) - - assert np.allclose(image_slice, expected_slice, atol=1e-3) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 def 
test_text_to_image_model_cpu_offload(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") @@ -233,11 +236,10 @@ def test_text_to_image_full_face(self): images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array( - [0.18115234, 0.13500977, 0.13427734, 0.24194336, 0.17138672, 0.16625977, 0.4260254, 0.43359375, 0.4416504] - ) + expected_slice = np.array([0.1958, 0.1475, 0.1396, 0.2412, 0.1658, 0.1533, 0.3997, 0.4055, 0.4128]) - assert np.allclose(image_slice, expected_slice, atol=1e-3) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 def test_unload(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") @@ -277,7 +279,9 @@ def test_multi(self): expected_slice = np.array( [0.5234375, 0.53515625, 0.5629883, 0.57128906, 0.59521484, 0.62109375, 0.57910156, 0.6201172, 0.6508789] ) - assert np.allclose(image_slice, expected_slice, atol=1e-3) + + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 @slow @@ -314,7 +318,8 @@ def test_text_to_image_sdxl(self): ] ) - assert np.allclose(image_slice, expected_slice, atol=1e-3) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") @@ -339,7 +344,8 @@ def test_text_to_image_sdxl(self): [0.0576596, 0.05600825, 0.04479006, 0.05288461, 0.05461192, 0.05137569, 0.04867965, 0.05301541, 0.04939842] ) - assert np.allclose(image_slice, expected_slice, atol=1e-3) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 def test_image_to_image_sdxl(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder") @@ -432,7 +438,8 @@ def test_inpainting_sdxl(self): [0.14181179, 0.1493012, 0.14283323, 0.14602411, 0.14915377, 0.15015268, 0.14725655, 0.15009224, 0.15164584] ) - assert np.allclose(image_slice, expected_slice, atol=1e-3) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") feature_extractor = self.get_image_processor("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") @@ -457,4 +464,5 @@ def test_inpainting_sdxl(self): expected_slice = np.array([0.1398, 0.1476, 0.1407, 0.1442, 0.1470, 0.1480, 0.1449, 0.1481, 0.1494]) - assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 From 8bf046b7fb8aa41691cb42596313038868b251ac Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 5 Feb 2024 16:23:18 +0530 Subject: [PATCH 14/43] Add single file and IP Adapter support to PIA Pipeline (#6851) update --- src/diffusers/pipelines/pia/pipeline_pia.py | 42 +++++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index 802081b52182..077de49cdc87 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor 
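# Illustrative sketch (not part of the diff). The test changes above swap exact allclose
# checks for a cosine-similarity distance; the assumption here is that the helper computes
# roughly "1 - cosine similarity" of the flattened slices, so tiny per-element drift no
# longer fails the test. The exact implementation lives in diffusers.utils.testing_utils.
import numpy as np

def sketch_cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    a, b = a.flatten(), b.flatten()
    return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

assert sketch_cosine_distance(np.ones(9), np.ones(9)) < 5e-4  # identical slices pass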
-from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from ...models.lora import adjust_lora_scale_text_encoder from ...models.unets.unet_motion_model import MotionAdapter @@ -209,7 +209,9 @@ class PIAPipelineOutput(BaseOutput): frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] -class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): +class PIAPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin +): r""" Pipeline for text-to-video generation. @@ -685,6 +687,35 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + + return image_embeds + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents def prepare_latents( self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None @@ -1107,12 +1138,9 @@ def __call__( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) if ip_adapter_image is not None: - output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True - image_embeds, negative_image_embeds = self.encode_image( - ip_adapter_image, device, num_videos_per_prompt, output_hidden_state + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, device, batch_size * num_videos_per_prompt ) - if self.do_classifier_free_guidance: - image_embeds = torch.cat([negative_image_embeds, image_embeds]) # 4. 
Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) From 493228a70835d8c4f4afd0c8507760d71dc43eae Mon Sep 17 00:00:00 2001 From: Edward Li Date: Mon, 5 Feb 2024 14:18:22 -0500 Subject: [PATCH 15/43] Fix `AutoencoderTiny` with `use_slicing` (#6850) * Fix `AutoencoderTiny` with `use_slicing` When using slicing with AutoencoderTiny, the encoder mistakenly encodes the entire batch for every image in the batch. * Fixed formatting issue --- src/diffusers/models/autoencoders/autoencoder_tiny.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_tiny.py b/src/diffusers/models/autoencoders/autoencoder_tiny.py index 08b1c0e74d70..401a1f3cd1a5 100644 --- a/src/diffusers/models/autoencoders/autoencoder_tiny.py +++ b/src/diffusers/models/autoencoders/autoencoder_tiny.py @@ -292,7 +292,9 @@ def encode( self, x: torch.FloatTensor, return_dict: bool = True ) -> Union[AutoencoderTinyOutput, Tuple[torch.FloatTensor]]: if self.use_slicing and x.shape[0] > 1: - output = [self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x) for x_slice in x.split(1)] + output = [ + self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x_slice) for x_slice in x.split(1) + ] output = torch.cat(output) else: output = self._tiled_encode(x) if self.use_tiling else self.encoder(x) From e6fd9ada3a4c45e96d170ce23e33242b3dd0cb56 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 6 Feb 2024 09:22:07 +0530 Subject: [PATCH 16/43] [I2vGenXL] clean up things (#6845) * remove _to_tensor * remove _to_tensor definition * remove _collapse_frames_into_batch * remove lora for not bloating the code. * remove sample_size. * simplify code a bit more * ensure timesteps are always in tensor. --- src/diffusers/models/unets/unet_i2vgen_xl.py | 53 +++++++++---------- .../pipelines/i2vgen_xl/pipeline_i2vgen_xl.py | 27 ---------- 2 files changed, 25 insertions(+), 55 deletions(-) diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index 6b78968cb505..de4acb7e0d07 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -48,29 +48,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -def _to_tensor(inputs, device): - if not torch.is_tensor(inputs): - # TODO: this requires sync between CPU and GPU. So try to pass `inputs` as tensors if you can - # This would be a good case for the `match` statement (Python 3.10+) - is_mps = device.type == "mps" - if isinstance(inputs, float): - dtype = torch.float32 if is_mps else torch.float64 - else: - dtype = torch.int32 if is_mps else torch.int64 - inputs = torch.tensor([inputs], dtype=dtype, device=device) - elif len(inputs.shape) == 0: - inputs = inputs[None].to(device) - - return inputs - - -def _collapse_frames_into_batch(sample: torch.Tensor) -> torch.Tensor: - batch_size, channels, num_frames, height, width = sample.shape - sample = sample.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) - - return sample - - class I2VGenXLTransformerTemporalEncoder(nn.Module): def __init__( self, @@ -174,8 +151,6 @@ def __init__( ): super().__init__() - self.sample_size = sample_size - # Check inputs if len(down_block_types) != len(up_block_types): raise ValueError( @@ -543,7 +518,18 @@ def forward( forward_upsample_size = True # 1. 
time - timesteps = _to_tensor(timestep, sample.device) + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass `timesteps` as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timesteps, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML timesteps = timesteps.expand(sample.shape[0]) @@ -572,7 +558,13 @@ def forward( context_emb = sample.new_zeros(batch_size, 0, self.config.cross_attention_dim) context_emb = torch.cat([context_emb, encoder_hidden_states], dim=1) - image_latents_context_embs = _collapse_frames_into_batch(image_latents[:, :, :1, :]) + image_latents_for_context_embds = image_latents[:, :, :1, :] + image_latents_context_embs = image_latents_for_context_embds.permute(0, 2, 1, 3, 4).reshape( + image_latents_for_context_embds.shape[0] * image_latents_for_context_embds.shape[2], + image_latents_for_context_embds.shape[1], + image_latents_for_context_embds.shape[3], + image_latents_for_context_embds.shape[4], + ) image_latents_context_embs = self.image_latents_context_embedding(image_latents_context_embs) _batch_size, _channels, _height, _width = image_latents_context_embs.shape @@ -586,7 +578,12 @@ def forward( context_emb = torch.cat([context_emb, image_emb], dim=1) context_emb = context_emb.repeat_interleave(repeats=num_frames, dim=0) - image_latents = _collapse_frames_into_batch(image_latents) + image_latents = image_latents.permute(0, 2, 1, 3, 4).reshape( + image_latents.shape[0] * image_latents.shape[2], + image_latents.shape[1], + image_latents.shape[3], + image_latents.shape[4], + ) image_latents = self.image_latents_proj_in(image_latents) image_latents = ( image_latents[None, :] diff --git a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py index 57a1449d8634..5988957cb10f 100644 --- a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +++ b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py @@ -22,18 +22,13 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import LoraLoaderMixin from ...models import AutoencoderKL -from ...models.lora import adjust_lora_scale_text_encoder from ...models.unets.unet_i2vgen_xl import I2VGenXLUNet from ...schedulers import DDIMScheduler from ...utils import ( - USE_PEFT_BACKEND, BaseOutput, logging, replace_example_docstring, - scale_lora_layers, - unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline @@ -207,7 +202,6 @@ def encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, - lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): r""" @@ -233,23 +227,10 @@ def encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
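# Illustrative sketch (not part of the diff; tensor sizes are hypothetical). The inlined
# permute/reshape earlier in this patch, which replaces _collapse_frames_into_batch, folds
# the frame axis into the batch axis, (B, C, F, H, W) -> (B*F, C, H, W):
import torch

sample = torch.randn(2, 4, 16, 32, 32)  # (batch, channels, frames, height, width)
collapsed = sample.permute(0, 2, 1, 3, 4).reshape(
    sample.shape[0] * sample.shape[2], sample.shape[1], sample.shape[3], sample.shape[4]
)
assert collapsed.shape == (2 * 16, 4, 32, 32)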
- lora_scale (`float`, *optional*): - A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. """ - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): - self._lora_scale = lora_scale - - # dynamically adjust the LoRA scale - if not USE_PEFT_BACKEND: - adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) - else: - scale_lora_layers(self.text_encoder, lora_scale) - if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -380,10 +361,6 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: - # Retrieve the original scale by scaling back the LoRA layers - unscale_lora_layers(self.text_encoder, lora_scale) - return prompt_embeds, negative_prompt_embeds def _encode_image(self, image, device, num_videos_per_prompt): @@ -706,9 +683,6 @@ def __call__( self._guidance_scale = guidance_scale # 3.1 Encode input text prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, device, @@ -716,7 +690,6 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, clip_skip=clip_skip, ) # For classifier free guidance, we need to do two forward passes. From 15f6b22466d833b9c08583120bcaf953c6240351 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 6 Feb 2024 14:48:07 +0530 Subject: [PATCH 17/43] add attention_head_dim --- src/diffusers/models/attention.py | 1 + src/diffusers/models/unets/unet_i2vgen_xl.py | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index d4d611250ad0..6a35a45373ea 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -158,6 +158,7 @@ def __init__( super().__init__() self.only_cross_attention = only_cross_attention + # We keep these boolean flags for backwards-compatibility. self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" self.use_ada_layer_norm_single = norm_type == "ada_norm_single" diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index de4acb7e0d07..45867053d4e0 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -120,6 +120,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. If `None`, normalization and activation layers is skipped in post-processing. cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features. 
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. num_attention_heads (`int`, *optional*): The number of attention heads. """ @@ -147,10 +148,16 @@ def __init__( layers_per_block: int = 2, norm_num_groups: Optional[int] = 32, cross_attention_dim: int = 1024, + attention_head_dim: Union[int, Tuple[int]] = None, num_attention_heads: Optional[Union[int, Tuple[int]]] = 64, ): super().__init__() + # We didn't define `attention_head_dim` when we first integrated this UNet. As a result, + # we had to use `num_attention_heads` in to pass values for arguments that actually denote + # attention head dimension. This is why we correct it here. + attention_head_dim = num_attention_heads or attention_head_dim + # Check inputs if len(down_block_types) != len(up_block_types): raise ValueError( @@ -172,7 +179,7 @@ def __init__( self.transformer_in = TransformerTemporalModel( num_attention_heads=8, - attention_head_dim=num_attention_heads, + attention_head_dim=attention_head_dim, in_channels=block_out_channels[0], num_layers=1, norm_num_groups=norm_num_groups, From 4f1df69d1a4faf56b58673886af4b83d919a0c6e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 6 Feb 2024 14:48:49 +0530 Subject: [PATCH 18/43] Revert "add attention_head_dim" This reverts commit 15f6b22466d833b9c08583120bcaf953c6240351. --- src/diffusers/models/attention.py | 1 - src/diffusers/models/unets/unet_i2vgen_xl.py | 9 +-------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 6a35a45373ea..d4d611250ad0 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -158,7 +158,6 @@ def __init__( super().__init__() self.only_cross_attention = only_cross_attention - # We keep these boolean flags for backwards-compatibility. self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" self.use_ada_layer_norm_single = norm_type == "ada_norm_single" diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index 45867053d4e0..de4acb7e0d07 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -120,7 +120,6 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. If `None`, normalization and activation layers is skipped in post-processing. cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features. - attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. num_attention_heads (`int`, *optional*): The number of attention heads. """ @@ -148,16 +147,10 @@ def __init__( layers_per_block: int = 2, norm_num_groups: Optional[int] = 32, cross_attention_dim: int = 1024, - attention_head_dim: Union[int, Tuple[int]] = None, num_attention_heads: Optional[Union[int, Tuple[int]]] = 64, ): super().__init__() - # We didn't define `attention_head_dim` when we first integrated this UNet. As a result, - # we had to use `num_attention_heads` in to pass values for arguments that actually denote - # attention head dimension. This is why we correct it here. 
- attention_head_dim = num_attention_heads or attention_head_dim - # Check inputs if len(down_block_types) != len(up_block_types): raise ValueError( @@ -179,7 +172,7 @@ def __init__( self.transformer_in = TransformerTemporalModel( num_attention_heads=8, - attention_head_dim=attention_head_dim, + attention_head_dim=num_attention_heads, in_channels=block_out_channels[0], num_layers=1, norm_num_groups=norm_num_groups, From e6a48db633823c60ae0c1f2fa483a031df01eca3 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 6 Feb 2024 16:43:17 +0530 Subject: [PATCH 19/43] Refactor Deepfloyd IF tests. (#6855) * update * update * update --- tests/pipelines/deepfloyd_if/test_if.py | 250 +----------------- .../pipelines/deepfloyd_if/test_if_img2img.py | 46 +++- .../test_if_img2img_superresolution.py | 58 +++- .../deepfloyd_if/test_if_inpainting.py | 51 +++- .../test_if_inpainting_superresolution.py | 58 +++- .../deepfloyd_if/test_if_superresolution.py | 52 +++- 6 files changed, 266 insertions(+), 249 deletions(-) diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py index 2e7383067eec..26674861eda5 100644 --- a/tests/pipelines/deepfloyd_if/test_if.py +++ b/tests/pipelines/deepfloyd_if/test_if.py @@ -14,22 +14,16 @@ # limitations under the License. import gc -import random import unittest import torch from diffusers import ( - IFImg2ImgPipeline, - IFImg2ImgSuperResolutionPipeline, - IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, IFPipeline, - IFSuperResolutionPipeline, ) from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device +from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, skip_mps, slow, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference @@ -97,77 +91,18 @@ def tearDown(self): gc.collect() torch.cuda.empty_cache() - def test_all(self): - # if + def test_if_text_to_image(self): + pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) + pipe.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe.enable_model_cpu_offload() - pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) - - pipe_2 = IFSuperResolutionPipeline.from_pretrained( - "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16, text_encoder=None, tokenizer=None - ) - - # pre compute text embeddings and remove T5 to save memory - - pipe_1.text_encoder.to("cuda") - - prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt("anime turtle", device="cuda") - - del pipe_1.tokenizer - del pipe_1.text_encoder - gc.collect() - - pipe_1.tokenizer = None - pipe_1.text_encoder = None - - pipe_1.enable_model_cpu_offload() - pipe_2.enable_model_cpu_offload() - - pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) - - self._test_if(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) - - pipe_1.remove_all_hooks() - pipe_2.remove_all_hooks() - - # img2img - - pipe_1 = IFImg2ImgPipeline(**pipe_1.components) - pipe_2 = IFImg2ImgSuperResolutionPipeline(**pipe_2.components) - - pipe_1.enable_model_cpu_offload() - pipe_2.enable_model_cpu_offload() - - 
pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) - - self._test_if_img2img(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) - - pipe_1.remove_all_hooks() - pipe_2.remove_all_hooks() - - # inpainting - - pipe_1 = IFInpaintingPipeline(**pipe_1.components) - pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components) - - pipe_1.enable_model_cpu_offload() - pipe_2.enable_model_cpu_offload() - - pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) - - self._test_if_inpainting(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) - - def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): - # pipeline 1 - - _start_torch_memory_measurement() + torch.cuda.reset_max_memory_allocated() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe_1( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, + output = pipe( + prompt="anime turtle", num_inference_steps=2, generator=generator, output_type="np", @@ -175,172 +110,11 @@ def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): image = output.images[0] - assert image.shape == (64, 64, 3) - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes < 13 * 10**9 + assert mem_bytes < 12 * 10**9 expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if.npy" ) assert_mean_pixel_difference(image, expected_image) - - # pipeline 2 - - _start_torch_memory_measurement() - - generator = torch.Generator(device="cpu").manual_seed(0) - - image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) - - output = pipe_2( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=image, - generator=generator, - num_inference_steps=2, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes < 4 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_superresolution_stage_II.npy" - ) - assert_mean_pixel_difference(image, expected_image) - - def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): - # pipeline 1 - - _start_torch_memory_measurement() - - image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) - - generator = torch.Generator(device="cpu").manual_seed(0) - - output = pipe_1( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=image, - num_inference_steps=2, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (64, 64, 3) - - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes < 10 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img.npy" - ) - assert_mean_pixel_difference(image, expected_image) - - # pipeline 2 - - _start_torch_memory_measurement() - - generator = torch.Generator(device="cpu").manual_seed(0) - - original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device) - image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) - - output = pipe_2( - prompt_embeds=prompt_embeds, - 
negative_prompt_embeds=negative_prompt_embeds, - image=image, - original_image=original_image, - generator=generator, - num_inference_steps=2, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes < 4 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img_superresolution_stage_II.npy" - ) - assert_mean_pixel_difference(image, expected_image) - - def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): - # pipeline 1 - - _start_torch_memory_measurement() - - image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) - mask_image = floats_tensor((1, 3, 64, 64), rng=random.Random(1)).to(torch_device) - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe_1( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=image, - mask_image=mask_image, - num_inference_steps=2, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (64, 64, 3) - - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes < 10 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting.npy" - ) - assert_mean_pixel_difference(image, expected_image) - - # pipeline 2 - - _start_torch_memory_measurement() - - generator = torch.Generator(device="cpu").manual_seed(0) - - image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) - original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device) - mask_image = floats_tensor((1, 3, 256, 256), rng=random.Random(1)).to(torch_device) - - output = pipe_2( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=image, - mask_image=mask_image, - original_image=original_image, - generator=generator, - num_inference_steps=2, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes < 4 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting_superresolution_stage_II.npy" - ) - assert_mean_pixel_difference(image, expected_image) - - -def _start_torch_memory_measurement(): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + pipe.remove_all_hooks() diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py index bfb70c5c9b98..85d6511686d1 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -13,20 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
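# Illustrative sketch (not part of the diff; the function name and the 12 GB budget are
# placeholders). The refactored slow tests in this patch all follow the same peak-VRAM
# pattern: reset CUDA memory stats before the pipeline call, then assert on the peak after:
import torch

def sketch_assert_peak_vram(run_pipeline, budget_bytes=12 * 10**9):
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    run_pipeline()
    mem_bytes = torch.cuda.max_memory_allocated()
    assert mem_bytes < budget_bytes, f"peak VRAM {mem_bytes} exceeded {budget_bytes}"
    return mem_bytes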
+import gc import random import unittest import torch from diffusers import IFImg2ImgPipeline +from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import floats_tensor, skip_mps, torch_device +from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device from ..pipeline_params import ( TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS, ) -from ..test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference from . import IFPipelineTesterMixin @@ -87,3 +89,43 @@ def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical( expected_max_diff=1e-2, ) + + +@slow +@require_torch_gpu +class IFImg2ImgPipelineSlowTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_if_img2img(self): + pipe = IFImg2ImgPipeline.from_pretrained( + "DeepFloyd/IF-I-L-v1.0", + variant="fp16", + torch_dtype=torch.float16, + ) + pipe.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe.enable_model_cpu_offload() + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + generator = torch.Generator(device="cpu").manual_seed(0) + output = pipe( + prompt="anime turtle", + image=image, + num_inference_steps=2, + generator=generator, + output_type="np", + ) + image = output.images[0] + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 12 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + pipe.remove_all_hooks() diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index f35f3e945609..d2a1f6bc1114 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -13,17 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import random import unittest import torch from diffusers import IFImg2ImgSuperResolutionPipeline +from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import floats_tensor, skip_mps, torch_device +from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device -from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ..test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference from . 
import IFPipelineTesterMixin @@ -82,3 +87,50 @@ def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical( expected_max_diff=1e-2, ) + + +@slow +@require_torch_gpu +class IFImg2ImgSuperResolutionPipelineSlowTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_if_img2img_superresolution(self): + pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", + variant="fp16", + torch_dtype=torch.float16, + ) + pipe.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe.enable_model_cpu_offload() + + generator = torch.Generator(device="cpu").manual_seed(0) + + original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device) + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + + output = pipe( + prompt="anime turtle", + image=image, + original_image=original_image, + generator=generator, + num_inference_steps=2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (256, 256, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 12 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img_superresolution_stage_II.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + pipe.remove_all_hooks() diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py index 68753c0ac1cd..9dab6565b79e 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -13,20 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import random import unittest import torch from diffusers import IFInpaintingPipeline +from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import floats_tensor, skip_mps, torch_device +from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device from ..pipeline_params import ( TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, ) -from ..test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference from . 
import IFPipelineTesterMixin @@ -85,3 +87,48 @@ def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical( expected_max_diff=1e-2, ) + + +@slow +@require_torch_gpu +class IFInpaintingPipelineSlowTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_if_inpainting(self): + pipe = IFInpaintingPipeline.from_pretrained( + "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16 + ) + pipe.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe.enable_model_cpu_offload() + + # Super resolution test + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + mask_image = floats_tensor((1, 3, 64, 64), rng=random.Random(1)).to(torch_device) + + generator = torch.Generator(device="cpu").manual_seed(0) + output = pipe( + prompt="anime prompts", + image=image, + mask_image=mask_image, + num_inference_steps=2, + generator=generator, + output_type="np", + ) + image = output.images[0] + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 12 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting.npy" + ) + assert_mean_pixel_difference(image, expected_image) + pipe.remove_all_hooks() diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py index 03b92e0d783c..987513de8314 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -13,20 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import random import unittest import torch from diffusers import IFInpaintingSuperResolutionPipeline +from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import floats_tensor, skip_mps, torch_device +from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device from ..pipeline_params import ( TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, ) -from ..test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference from . 
import IFPipelineTesterMixin @@ -87,3 +89,55 @@ def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical( expected_max_diff=1e-2, ) + + +@slow +@require_torch_gpu +class IFInpaintingSuperResolutionPipelineSlowTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_if_inpainting_superresolution(self): + pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16 + ) + pipe.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe.enable_model_cpu_offload() + + # Super resolution test + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + generator = torch.Generator(device="cpu").manual_seed(0) + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device) + mask_image = floats_tensor((1, 3, 256, 256), rng=random.Random(1)).to(torch_device) + + output = pipe( + prompt="anime turtle", + image=image, + original_image=original_image, + mask_image=mask_image, + generator=generator, + num_inference_steps=2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (256, 256, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 12 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting_superresolution_stage_II.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + pipe.remove_all_hooks() diff --git a/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_superresolution.py index 5a74148e6661..e72b32c5057f 100644 --- a/tests/pipelines/deepfloyd_if/test_if_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_superresolution.py @@ -13,17 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import random import unittest import torch from diffusers import IFSuperResolutionPipeline +from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import floats_tensor, skip_mps, torch_device +from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ..test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference from . 
import IFPipelineTesterMixin @@ -80,3 +82,49 @@ def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical( expected_max_diff=1e-2, ) + + +@slow +@require_torch_gpu +class IFSuperResolutionPipelineSlowTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_if_superresolution(self): + pipe = IFSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16 + ) + pipe.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe.enable_model_cpu_offload() + + # Super resolution test + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + generator = torch.Generator(device="cpu").manual_seed(0) + output = pipe( + prompt="anime turtle", + image=image, + generator=generator, + num_inference_steps=2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (256, 256, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 12 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_superresolution_stage_II.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + pipe.remove_all_hooks() From 994360f7a57e3f10e961cc8ab3bbcad2fa37634b Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 7 Feb 2024 00:23:40 +0530 Subject: [PATCH 20/43] Fix last IP Adapter test (#6875) update --- .../ip_adapters/test_ip_adapter_stable_diffusion.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index 84aa41f54bb6..cb056a3561d7 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -235,8 +235,7 @@ def test_text_to_image_full_face(self): inputs = self.get_dummy_inputs() images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - - expected_slice = np.array([0.1958, 0.1475, 0.1396, 0.2412, 0.1658, 0.1533, 0.3997, 0.4055, 0.4128]) + expected_slice = np.array([0.1704, 0.1296, 0.1272, 0.2212, 0.1514, 0.1479, 0.4172, 0.4263, 0.4360]) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -276,9 +275,7 @@ def test_multi(self): inputs["ip_adapter_image"] = [ip_adapter_image, [ip_adapter_image] * 2] images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array( - [0.5234375, 0.53515625, 0.5629883, 0.57128906, 0.59521484, 0.62109375, 0.57910156, 0.6201172, 0.6508789] - ) + expected_slice = np.array([0.1704, 0.1296, 0.1272, 0.2212, 0.1514, 0.1479, 0.4172, 0.4263, 0.4360]) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 From 17612de451244c4c169b1498f1333401dfd3106f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Sanz?= Date: Tue, 6 Feb 2024 21:05:40 +0100 Subject: [PATCH 21/43] fix: typo in callback function name and property (#6834) * fix: callback function name is incorrect On this tutorial there is a function defined and then used inside `callback_on_step_end` argument, but the name was not correct (mismatch) * fix: typo in num_timestep (correct is num_timesteps) fixed property 
name --- docs/source/en/using-diffusers/callback.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/en/using-diffusers/callback.md b/docs/source/en/using-diffusers/callback.md index c2133fb8f7b0..dfa4395e5317 100644 --- a/docs/source/en/using-diffusers/callback.md +++ b/docs/source/en/using-diffusers/callback.md @@ -18,8 +18,8 @@ This guide will show you how to use the `callback_on_step_end` parameter to disa The callback function should have the following arguments: -* `pipe` (or the pipeline instance) provides access to useful properties such as `num_timestep` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipe._guidance_scale=0.0`. -* `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timestep`. +* `pipe` (or the pipeline instance) provides access to useful properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipe._guidance_scale=0.0`. +* `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`. * `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly. Your callback function should look something like this: @@ -27,7 +27,7 @@ Your callback function should look something like this: ```python def callback_dynamic_cfg(pipe, step_index, timestep, callback_kwargs): # adjust the batch_size of prompt_embeds according to guidance_scale - if step_index == int(pipe.num_timestep * 0.4): + if step_index == int(pipe.num_timesteps * 0.4): prompt_embeds = callback_kwargs["prompt_embeds"] prompt_embeds = prompt_embeds.chunk(2)[-1] @@ -49,7 +49,7 @@ pipe = pipe.to("cuda") prompt = "a photo of an astronaut riding a horse on mars" generator = torch.Generator(device="cuda").manual_seed(1) -out = pipe(prompt, generator=generator, callback_on_step_end=callback_custom_cfg, callback_on_step_end_tensor_inputs=['prompt_embeds']) +out = pipe(prompt, generator=generator, callback_on_step_end=callback_dynamic_cfg, callback_on_step_end_tensor_inputs=['prompt_embeds']) out.images[0].save("out_custom_cfg.png") ``` From 76696dca558267999abf3e7c29e1a256cbcb407a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 7 Feb 2024 15:07:11 +0530 Subject: [PATCH 22/43] [Model Card] standardize dreambooth model card (#6729) * feat: standarize model card creation for dreambooth training. * correct 'inference * remove comments. * take component out of kwargs * style * add: card template to have a leaner description. * widget support. 
* propagate changes to train_dreambooth_lora * propagate changes to custom diffusion * make widget properly type-annotated --- .../train_custom_diffusion.py | 33 ++++----- examples/dreambooth/train_dreambooth.py | 54 +++++++------- examples/dreambooth/train_dreambooth_lora.py | 36 ++++----- .../dreambooth/train_dreambooth_lora_sdxl.py | 64 ++++++++-------- src/diffusers/utils/hub_utils.py | 74 +++++++++++++++---- src/diffusers/utils/model_card_template.md | 24 ++++++ 6 files changed, 180 insertions(+), 105 deletions(-) create mode 100644 src/diffusers/utils/model_card_template.md diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py index 559430eba177..0ea2732a959f 100644 --- a/examples/custom_diffusion/train_custom_diffusion.py +++ b/examples/custom_diffusion/train_custom_diffusion.py @@ -58,6 +58,7 @@ ) from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card from diffusers.utils.import_utils import is_xformers_available @@ -78,21 +79,7 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_ image.save(os.path.join(repo_folder, f"image_{i}.png")) img_str += f"![img_{i}](./image_{i}.png)\n" - yaml = f""" ---- -license: creativeml-openrail-m -base_model: {base_model} -instance_prompt: {prompt} -tags: -- stable-diffusion -- stable-diffusion-diffusers -- text-to-image -- diffusers -- custom-diffusion -inference: true ---- - """ - model_card = f""" + model_description = f""" # Custom Diffusion - {repo_id} These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n @@ -100,8 +87,20 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_ \nFor more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion). 
""" - with open(os.path.join(repo_folder, "README.md"), "w") as f: - f.write(yaml + model_card) + model_card = load_or_create_model_card( + repo_id_or_path=repo_id, + from_training=True, + license="creativeml-openrail-m", + base_model=base_model, + instance_prompt=prompt, + model_description=model_description, + inference=True, + ) + + tags = ["text-to-image", "diffusers", "stable-diffusion", "stable-diffusion-diffusers", "custom-diffusion"] + model_card = populate_model_card(model_card, tags=tags) + + model_card.save(os.path.join(repo_folder, "README.md")) def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 532e134a6153..4d899ef56e37 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -54,6 +54,7 @@ from diffusers.optimization import get_scheduler from diffusers.training_utils import compute_snr from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.torch_utils import is_compiled_module @@ -69,33 +70,20 @@ def save_model_card( repo_id: str, - images=None, - base_model=str, + images: list = None, + base_model: str = None, train_text_encoder=False, - prompt=str, - repo_folder=None, + prompt: str = None, + repo_folder: str = None, pipeline: DiffusionPipeline = None, ): img_str = "" - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f"![img_{i}](./image_{i}.png)\n" - - yaml = f""" ---- -license: creativeml-openrail-m -base_model: {base_model} -instance_prompt: {prompt} -tags: -- {'stable-diffusion' if isinstance(pipeline, StableDiffusionPipeline) else 'if'} -- {'stable-diffusion-diffusers' if isinstance(pipeline, StableDiffusionPipeline) else 'if-diffusers'} -- text-to-image -- diffusers -- dreambooth -inference: true ---- - """ - model_card = f""" + if images is not None: + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" + + model_description = f""" # DreamBooth - {repo_id} This is a dreambooth model derived from {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/). @@ -104,8 +92,24 @@ def save_model_card( DreamBooth for the text encoder was enabled: {train_text_encoder}. 
""" - with open(os.path.join(repo_folder, "README.md"), "w") as f: - f.write(yaml + model_card) + model_card = load_or_create_model_card( + repo_id_or_path=repo_id, + from_training=True, + license="creativeml-openrail-m", + base_model=base_model, + instance_prompt=prompt, + model_description=model_description, + inference=True, + ) + + tags = ["text-to-image", "dreambooth"] + if isinstance(pipeline, StableDiffusionPipeline): + tags.extend(["stable-diffusion", "stable-diffusion-diffusers"]) + else: + tags.extend(["if", "if-diffusers"]) + model_card = populate_model_card(model_card, tags=tags) + + model_card.save(os.path.join(repo_folder, "README.md")) def log_validation( diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 3724e3d140d9..f0c47821b0c9 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -61,6 +61,7 @@ convert_unet_state_dict_to_peft, is_wandb_available, ) +from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.torch_utils import is_compiled_module @@ -85,21 +86,7 @@ def save_model_card( image.save(os.path.join(repo_folder, f"image_{i}.png")) img_str += f"![img_{i}](./image_{i}.png)\n" - yaml = f""" ---- -license: creativeml-openrail-m -base_model: {base_model} -instance_prompt: {prompt} -tags: -- {'stable-diffusion' if isinstance(pipeline, StableDiffusionPipeline) else 'if'} -- {'stable-diffusion-diffusers' if isinstance(pipeline, StableDiffusionPipeline) else 'if-diffusers'} -- text-to-image -- diffusers -- lora -inference: true ---- - """ - model_card = f""" + model_description = f""" # LoRA DreamBooth - {repo_id} These are LoRA adaption weights for {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/). You can find some example images in the following. \n @@ -107,8 +94,23 @@ def save_model_card( LoRA for the text encoder was enabled: {train_text_encoder}. 
""" - with open(os.path.join(repo_folder, "README.md"), "w") as f: - f.write(yaml + model_card) + model_card = load_or_create_model_card( + repo_id_or_path=repo_id, + from_training=True, + license="creativeml-openrail-m", + base_model=base_model, + instance_prompt=prompt, + model_description=model_description, + inference=True, + ) + tags = ["text-to-image", "diffusers", "lora"] + if isinstance(pipeline, StableDiffusionPipeline): + tags.extend(["stable-diffusion", "stable-diffusion-diffusers"]) + else: + tags.extend(["if", "if-diffusers"]) + model_card = populate_model_card(model_card, tags=tags) + + model_card.save(os.path.join(repo_folder, "README.md")) def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index aa09bf9a0ebf..8df61f132510 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -62,6 +62,7 @@ convert_unet_state_dict_to_peft, is_wandb_available, ) +from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.torch_utils import is_compiled_module @@ -75,40 +76,22 @@ def save_model_card( repo_id: str, images=None, - base_model=str, + base_model: str = None, train_text_encoder=False, - instance_prompt=str, - validation_prompt=str, + instance_prompt=None, + validation_prompt=None, repo_folder=None, vae_path=None, ): - img_str = "widget:\n" if images else "" - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f""" - - text: '{validation_prompt if validation_prompt else ' ' }' - output: - url: - "image_{i}.png" - """ - - yaml = f""" ---- -tags: -- stable-diffusion-xl -- stable-diffusion-xl-diffusers -- text-to-image -- diffusers -- lora -- template:sd-lora -{img_str} -base_model: {base_model} -instance_prompt: {instance_prompt} -license: openrail++ ---- - """ + widget_dict = [] + if images is not None: + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + widget_dict.append( + {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}} + ) - model_card = f""" + model_description = f""" # SDXL LoRA DreamBooth - {repo_id} @@ -134,8 +117,27 @@ def save_model_card( [Download]({repo_id}/tree/main) them in the Files & versions tab. 
""" - with open(os.path.join(repo_folder, "README.md"), "w") as f: - f.write(yaml + model_card) + model_card = load_or_create_model_card( + repo_id_or_path=repo_id, + from_training=True, + license="openrail++", + base_model=base_model, + instance_prompt=instance_prompt, + model_description=model_description, + widget=widget_dict, + ) + tags = [ + "text-to-image", + "stable-diffusion-xl", + "stable-diffusion-xl-diffusers", + "text-to-image", + "diffusers", + "lora", + "template:sd-lora", + ] + model_card = populate_model_card(model_card, tags=tags) + + model_card.save(os.path.join(repo_folder, "README.md")) def import_model_class_from_model_name_or_path( diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py index c6a45b569218..7528e34d3332 100644 --- a/src/diffusers/utils/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -21,7 +21,7 @@ import traceback import warnings from pathlib import Path -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union from uuid import uuid4 from huggingface_hub import ( @@ -65,7 +65,7 @@ logger = get_logger(__name__) - +MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "model_card_template.md" SESSION_ID = uuid4().hex @@ -94,43 +94,87 @@ def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str: def load_or_create_model_card( - repo_id_or_path: Optional[str] = None, token: Optional[str] = None, is_pipeline: bool = False + repo_id_or_path: str = None, + token: Optional[str] = None, + is_pipeline: bool = False, + from_training: bool = False, + model_description: Optional[str] = None, + base_model: str = None, + prompt: Optional[str] = None, + license: Optional[str] = None, + widget: Optional[List[dict]] = None, + inference: Optional[bool] = None, ) -> ModelCard: """ Loads or creates a model card. Args: - repo_id (`str`): - The repo_id where to look for the model card. + repo_id_or_path (`str`): + The repo id (e.g., "runwayml/stable-diffusion-v1-5") or local path where to look for the model card. token (`str`, *optional*): Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more details. - is_pipeline (`bool`, *optional*): + is_pipeline (`bool`): Boolean to indicate if we're adding tag to a [`DiffusionPipeline`]. + from_training: (`bool`): Boolean flag to denote if the model card is being created from a training script. + model_description (`str`, *optional*): Model description to add to the model card. Helpful when using + `load_or_create_model_card` from a training script. + base_model (`str`): Base model identifier (e.g., "stabilityai/stable-diffusion-xl-base-1.0"). Useful + for DreamBooth-like training. + prompt (`str`, *optional*): Prompt used for training. Useful for DreamBooth-like training. + license: (`str`, *optional*): License of the output artifact. Helpful when using + `load_or_create_model_card` from a training script. + widget (`List[dict]`, *optional*): Widget to accompany a gallery template. + inference: (`bool`, optional): Whether to turn on inference widget. Helpful when using + `load_or_create_model_card` from a training script. """ if not is_jinja_available(): raise ValueError( "Modelcard rendering is based on Jinja templates." - " Please make sure to have `jinja` installed before using `create_model_card`." + " Please make sure to have `jinja` installed before using `load_or_create_model_card`." " To install it, please run `pip install Jinja2`." 
) try: # Check if the model card is present on the remote repo model_card = ModelCard.load(repo_id_or_path, token=token) - except EntryNotFoundError: - # Otherwise create a simple model card from template - component = "pipeline" if is_pipeline else "model" - model_description = f"This is the model card of a 🧨 diffusers {component} that has been pushed on the Hub. This model card has been automatically generated." - card_data = ModelCardData() - model_card = ModelCard.from_template(card_data, model_description=model_description) + except (EntryNotFoundError, RepositoryNotFoundError): + # Otherwise create a model card from template + if from_training: + model_card = ModelCard.from_template( + card_data=ModelCardData( # Card metadata object that will be converted to YAML block + license=license, + library_name="diffusers", + inference=inference, + base_model=base_model, + instance_prompt=prompt, + widget=widget, + ), + template_path=MODEL_CARD_TEMPLATE_PATH, + model_description=model_description, + ) + else: + card_data = ModelCardData() + component = "pipeline" if is_pipeline else "model" + if model_description is None: + model_description = f"This is the model card of a 🧨 diffusers {component} that has been pushed on the Hub. This model card has been automatically generated." + model_card = ModelCard.from_template(card_data, model_description=model_description) return model_card -def populate_model_card(model_card: ModelCard) -> ModelCard: - """Populates the `model_card` with library name.""" +def populate_model_card(model_card: ModelCard, tags: Union[str, List[str]] = None) -> ModelCard: + """Populates the `model_card` with library name and optional tags.""" if model_card.data.library_name is None: model_card.data.library_name = "diffusers" + + if tags is not None: + if isinstance(tags, str): + tags = [tags] + if model_card.data.tags is None: + model_card.data.tags = [] + for tag in tags: + model_card.data.tags.append(tag) + return model_card diff --git a/src/diffusers/utils/model_card_template.md b/src/diffusers/utils/model_card_template.md new file mode 100644 index 000000000000..f41b71e24e20 --- /dev/null +++ b/src/diffusers/utils/model_card_template.md @@ -0,0 +1,24 @@ +--- +{{ card_data }} +--- + + + +{{ model_description }} + +## Intended uses & limitations + +#### How to use + +```python +# TODO: add an example code snippet for running this diffusion pipeline +``` + +#### Limitations and bias + +[TODO: provide examples of latent issues and potential remediations] + +## Training details + +[TODO: describe the data used to train the model] From 97d004b9b4283d194a4d2c410be2d441527e0800 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 7 Feb 2024 10:13:12 -1000 Subject: [PATCH 23/43] [ip-adapter] make sure length of `scale` is same as number of ip-adapters when using `set_ip_adapter_scale` (#6884) add Co-authored-by: yiyixuxu --- src/diffusers/loaders/ip_adapter.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 679c46d57e2a..d386c0bf071a 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -181,11 +181,16 @@ def load_ip_adapter( unet._load_ip_adapter_weights(state_dicts) def set_ip_adapter_scale(self, scale): - if not isinstance(scale, list): - scale = [scale] unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet for attn_processor in unet.attn_processors.values(): if isinstance(attn_processor, 
(IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
+                if not isinstance(scale, list):
+                    scale = [scale] * len(attn_processor.scale)
+                if len(attn_processor.scale) != len(scale):
+                    raise ValueError(
+                        f"`scale` should be a list of same length as the number of ip-adapters "
+                        f"Expected {len(attn_processor.scale)} but got {len(scale)}."
+                    )
                 attn_processor.scale = scale

     def unload_ip_adapter(self):

From 4a3d52850b2a1a3da47c91525b8899465b76606e Mon Sep 17 00:00:00 2001
From: camaro <37071446+bamps53@users.noreply.github.com>
Date: Thu, 8 Feb 2024 13:07:56 +0900
Subject: [PATCH 24/43] fix: keyword argument mismatch (#6895)

---
 examples/custom_diffusion/train_custom_diffusion.py | 2 +-
 examples/dreambooth/train_dreambooth.py | 2 +-
 examples/dreambooth/train_dreambooth_lora.py | 2 +-
 examples/dreambooth/train_dreambooth_lora_sdxl.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index 0ea2732a959f..0e59db211492 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -92,7 +92,7 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_
         from_training=True,
         license="creativeml-openrail-m",
         base_model=base_model,
-        instance_prompt=prompt,
+        prompt=prompt,
         model_description=model_description,
         inference=True,
     )
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 4d899ef56e37..4847e214bc48 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -97,7 +97,7 @@ def save_model_card(
         from_training=True,
         license="creativeml-openrail-m",
         base_model=base_model,
-        instance_prompt=prompt,
+        prompt=prompt,
         model_description=model_description,
         inference=True,
     )
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index f0c47821b0c9..5aa2bc16b84e 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -99,7 +99,7 @@ def save_model_card(
         from_training=True,
         license="creativeml-openrail-m",
         base_model=base_model,
-        instance_prompt=prompt,
+        prompt=prompt,
         model_description=model_description,
         inference=True,
     )
diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py
index 8df61f132510..46bf40413d7c 100644
--- a/examples/dreambooth/train_dreambooth_lora_sdxl.py
+++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py
@@ -122,7 +122,7 @@ def save_model_card(
         from_training=True,
         license="openrail++",
         base_model=base_model,
-        instance_prompt=instance_prompt,
+        prompt=instance_prompt,
         model_description=model_description,
         widget=widget_dict,
     )

From 1835510524a058da1d3b3c5f951bbaaae5b21421 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Thu, 8 Feb 2024 09:38:57 +0530
Subject: [PATCH 25/43] Remove `torch_dtype` in to() to end deprecation (#6886)

* remove torch_dtype from to()
* remove torch_dtype from usage scripts.
* remove old lora backend
* Revert "remove old lora backend"

This reverts commit adcddf6ba421f847e7da2a0ce57b9456cae43356.
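For readers skimming the series, the call-site change this commit describes looks roughly like the sketch below; the checkpoint id and the fp16/CUDA choices are placeholders, not something the patch prescribes.

```python
import torch

from diffusers import DiffusionPipeline

# Placeholder checkpoint id; any diffusers pipeline behaves the same way.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# Deprecated spelling whose handling this patch removes:
#   pipe.to(torch_dtype=torch.float16)
# Supported spelling going forward, mirroring the script and test updates below:
pipe = pipe.to(device="cuda", dtype=torch.float16)  # requires a CUDA device
```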
--- scripts/convert_gligen_to_diffusers.py | 2 +- ..._original_stable_diffusion_to_diffusers.py | 2 +- scripts/convert_zero123_to_diffusers.py | 2 +- src/diffusers/pipelines/pipeline_utils.py | 30 +++---------------- .../pipelines/animatediff/test_animatediff.py | 2 +- .../test_animatediff_video2video.py | 2 +- tests/pipelines/audioldm2/test_audioldm2.py | 2 +- tests/pipelines/musicldm/test_musicldm.py | 2 +- tests/pipelines/pia/test_pia.py | 2 +- .../test_stable_video_diffusion.py | 2 +- tests/pipelines/test_pipelines.py | 2 +- tests/pipelines/test_pipelines_common.py | 2 +- 12 files changed, 15 insertions(+), 37 deletions(-) diff --git a/scripts/convert_gligen_to_diffusers.py b/scripts/convert_gligen_to_diffusers.py index 30d789b60634..83c1f928e407 100644 --- a/scripts/convert_gligen_to_diffusers.py +++ b/scripts/convert_gligen_to_diffusers.py @@ -576,6 +576,6 @@ def convert_gligen_to_diffusers( ) if args.half: - pipe.to(torch_dtype=torch.float16) + pipe.to(dtype=torch.float16) pipe.save_pretrained(args.dump_path) diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index 2ca70963d132..980446179cfd 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -179,7 +179,7 @@ ) if args.half: - pipe.to(torch_dtype=torch.float16) + pipe.to(dtype=torch.float16) if args.controlnet: # only save the controlnet model diff --git a/scripts/convert_zero123_to_diffusers.py b/scripts/convert_zero123_to_diffusers.py index f016312b8bb6..3bb6f6c041c9 100644 --- a/scripts/convert_zero123_to_diffusers.py +++ b/scripts/convert_zero123_to_diffusers.py @@ -801,6 +801,6 @@ def convert_from_original_zero123_ckpt(checkpoint_path, original_config_file, ex ) if args.half: - pipe.to(torch_dtype=torch.float16) + pipe.to(dtype=torch.float16) pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 06187645f000..769fcd2e832a 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -775,32 +775,10 @@ def to(self, *args, **kwargs): Returns: [`DiffusionPipeline`]: The pipeline converted to specified `dtype` and/or `dtype`. """ - - torch_dtype = kwargs.pop("torch_dtype", None) - if torch_dtype is not None: - deprecate("torch_dtype", "0.27.0", "") - torch_device = kwargs.pop("torch_device", None) - if torch_device is not None: - deprecate("torch_device", "0.27.0", "") - - dtype_kwarg = kwargs.pop("dtype", None) - device_kwarg = kwargs.pop("device", None) + dtype = kwargs.pop("dtype", None) + device = kwargs.pop("device", None) silence_dtype_warnings = kwargs.pop("silence_dtype_warnings", False) - if torch_dtype is not None and dtype_kwarg is not None: - raise ValueError( - "You have passed both `torch_dtype` and `dtype` as a keyword argument. Please make sure to only pass `dtype`." - ) - - dtype = torch_dtype or dtype_kwarg - - if torch_device is not None and device_kwarg is not None: - raise ValueError( - "You have passed both `torch_device` and `device` as a keyword argument. Please make sure to only pass `device`." 
- ) - - device = torch_device or device_kwarg - dtype_arg = None device_arg = None if len(args) == 1: @@ -873,12 +851,12 @@ def module_is_offloaded(module): if is_loaded_in_8bit and dtype is not None: logger.warning( - f"The module '{module.__class__.__name__}' has been loaded in 8bit and conversion to {torch_dtype} is not yet supported. Module is still in 8bit precision." + f"The module '{module.__class__.__name__}' has been loaded in 8bit and conversion to {dtype} is not yet supported. Module is still in 8bit precision." ) if is_loaded_in_8bit and device is not None: logger.warning( - f"The module '{module.__class__.__name__}' has been loaded in 8bit and moving it to {torch_dtype} via `.to()` is not yet supported. Module is still on {module.device}." + f"The module '{module.__class__.__name__}' has been loaded in 8bit and moving it to {dtype} via `.to()` is not yet supported. Module is still on {module.device}." ) else: module.to(device, dtype) diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py index 80a8fd19f5a0..525ca24bbd9a 100644 --- a/tests/pipelines/animatediff/test_animatediff.py +++ b/tests/pipelines/animatediff/test_animatediff.py @@ -218,7 +218,7 @@ def test_to_dtype(self): model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) - pipe.to(torch_dtype=torch.float16) + pipe.to(dtype=torch.float16) model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) diff --git a/tests/pipelines/animatediff/test_animatediff_video2video.py b/tests/pipelines/animatediff/test_animatediff_video2video.py index 3226bdb3ca6e..767fc30b4eb5 100644 --- a/tests/pipelines/animatediff/test_animatediff_video2video.py +++ b/tests/pipelines/animatediff/test_animatediff_video2video.py @@ -224,7 +224,7 @@ def test_to_dtype(self): model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) - pipe.to(torch_dtype=torch.float16) + pipe.to(dtype=torch.float16) model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py index 60ef86518e35..e2655515bc40 100644 --- a/tests/pipelines/audioldm2/test_audioldm2.py +++ b/tests/pipelines/audioldm2/test_audioldm2.py @@ -483,7 +483,7 @@ def test_to_dtype(self): self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values())) # Once we send to fp16, all params are in half-precision, including the logit scale - pipe.to(torch_dtype=torch.float16) + pipe.to(dtype=torch.float16) model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")} self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values())) diff --git a/tests/pipelines/musicldm/test_musicldm.py b/tests/pipelines/musicldm/test_musicldm.py index 4bf03569bbf3..fe78ab6acbb1 100644 --- a/tests/pipelines/musicldm/test_musicldm.py +++ b/tests/pipelines/musicldm/test_musicldm.py @@ -400,7 +400,7 @@ def test_to_dtype(self): self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values())) # Once we send to fp16, 
all params are in half-precision, including the logit scale - pipe.to(torch_dtype=torch.float16) + pipe.to(dtype=torch.float16) model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")} self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values())) diff --git a/tests/pipelines/pia/test_pia.py b/tests/pipelines/pia/test_pia.py index eb76457abc9d..edd129560c63 100644 --- a/tests/pipelines/pia/test_pia.py +++ b/tests/pipelines/pia/test_pia.py @@ -231,7 +231,7 @@ def test_to_dtype(self): model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) - pipe.to(torch_dtype=torch.float16) + pipe.to(dtype=torch.float16) model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) diff --git a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py index 871266fb9c24..60c411283803 100644 --- a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py +++ b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py @@ -396,7 +396,7 @@ def test_to_dtype(self): model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) - pipe.to(torch_dtype=torch.float16) + pipe.to(dtype=torch.float16) model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 32ae81ddc2d8..bd9f42f185e6 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -1623,7 +1623,7 @@ def test_pipe_to(self): sd1 = sd.to(torch.float16) sd2 = sd.to(None, torch.float16) sd3 = sd.to(dtype=torch.float16) - sd4 = sd.to(torch_dtype=torch.float16) + sd4 = sd.to(dtype=torch.float16) sd5 = sd.to(None, dtype=torch.float16) sd6 = sd.to(None, torch_dtype=torch.float16) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index e3c8a4ef503f..7f51847caf07 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -716,7 +716,7 @@ def test_to_dtype(self): model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) - pipe.to(torch_dtype=torch.float16) + pipe.to(dtype=torch.float16) model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) From a11b0f83b741b7cb496e9ff02033449c40d30b6b Mon Sep 17 00:00:00 2001 From: Srimanth Agastyaraju <30816357+asrimanth@users.noreply.github.com> Date: Thu, 8 Feb 2024 00:39:29 -0500 Subject: [PATCH 26/43] Fix: training resume from fp16 for SDXL Consistency Distillation (#6840) * Fix: training resume from fp16 for lcm distill lora sdxl * Fix coding quality - run linter * Fix 1 - shift mixed precision cast before optimizer * Fix 2 - State dict errors by removing load_lora_into_unet * Update train_lcm_distill_lora_sdxl.py - Revert default cache dir to None --------- 
Co-authored-by: Sayak Paul --- .../train_lcm_distill_lora_sdxl.py | 41 +++++++++++++++---- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py index 44a58fa2a815..34de0d048e8b 100644 --- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py +++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py @@ -36,7 +36,7 @@ from datasets import load_dataset from huggingface_hub import create_repo, upload_folder from packaging import version -from peft import LoraConfig, get_peft_model_state_dict +from peft import LoraConfig, get_peft_model_state_dict, set_peft_model_state_dict from torchvision import transforms from torchvision.transforms.functional import crop from tqdm.auto import tqdm @@ -52,7 +52,12 @@ ) from diffusers.optimization import get_scheduler from diffusers.training_utils import cast_training_params, resolve_interpolation_mode -from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available +from diffusers.utils import ( + check_min_version, + convert_state_dict_to_diffusers, + convert_unet_state_dict_to_peft, + is_wandb_available, +) from diffusers.utils.import_utils import is_xformers_available @@ -858,11 +863,6 @@ def main(args): ) unet.add_adapter(lora_config) - # Make sure the trainable params are in float32. - if args.mixed_precision == "fp16": - # only upcast trainable parameters (LoRA) into fp32 - cast_training_params(unet, dtype=torch.float32) - # Also move the alpha and sigma noise schedules to accelerator.device. alpha_schedule = alpha_schedule.to(accelerator.device) sigma_schedule = sigma_schedule.to(accelerator.device) @@ -887,13 +887,31 @@ def save_model_hook(models, weights, output_dir): def load_model_hook(models, input_dir): # load the LoRA into the model unet_ = accelerator.unwrap_model(unet) - lora_state_dict, network_alphas = StableDiffusionXLPipeline.lora_state_dict(input_dir) - StableDiffusionXLPipeline.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_) + lora_state_dict, _ = StableDiffusionXLPipeline.lora_state_dict(input_dir) + unet_state_dict = { + f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.") + } + unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) + incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default") + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) for _ in range(len(models)): # pop models so that they are not loaded again models.pop() + # Make sure the trainable params are in float32. This is again needed since the base models + # are in `weight_dtype`. 
More details: + # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804 + if args.mixed_precision == "fp16": + cast_training_params(unet_, dtype=torch.float32) + accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) @@ -1092,6 +1110,11 @@ def compute_time_ids(original_size, crops_coords_top_left): args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes ) + # Make sure the trainable params are in float32. + if args.mixed_precision == "fp16": + # only upcast trainable parameters (LoRA) into fp32 + cast_training_params(unet, dtype=torch.float32) + lr_scheduler = get_scheduler( args.lr_scheduler, optimizer=optimizer, From aa82df52e719f22a51f2881ebe15d2904586160a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 8 Feb 2024 11:10:10 +0530 Subject: [PATCH 27/43] [IP Adapters] introduce `ip_adapter_image_embeds` in the SD pipeline call (#6868) * add: support for passing ip adapter image embeddings * debugging * make feature_extractor unloading conditioned on safety_checker * better condition * type annotation * index to look into value slices * more debugging * debugging * serialize embeddings dict * better conditioning * remove unnecessary prints. * Update src/diffusers/loaders/ip_adapter.py Co-authored-by: YiYi Xu * make fix-copies and styling. * styling and further copy fixing. * fix: check_inputs call in controlnet sdxl img2img pipeline --------- Co-authored-by: YiYi Xu --- src/diffusers/loaders/ip_adapter.py | 10 ++- .../animatediff/pipeline_animatediff.py | 67 ++++++++++------ .../controlnet/pipeline_controlnet.py | 67 ++++++++++------ .../controlnet/pipeline_controlnet_img2img.py | 67 ++++++++++------ .../controlnet/pipeline_controlnet_inpaint.py | 67 ++++++++++------ .../controlnet/pipeline_controlnet_sd_xl.py | 69 ++++++++++------ .../pipeline_controlnet_sd_xl_img2img.py | 69 ++++++++++------ .../pipeline_cycle_diffusion.py | 1 - ...ipeline_stable_diffusion_inpaint_legacy.py | 1 - ...pipeline_stable_diffusion_model_editing.py | 1 - .../pipeline_stable_diffusion_paradigms.py | 1 - ...eline_versatile_diffusion_text_to_image.py | 1 - .../pipeline_latent_consistency_img2img.py | 77 +++++++++++------- src/diffusers/pipelines/pia/pipeline_pia.py | 65 +++++++++------ .../pipeline_semantic_stable_diffusion.py | 1 - .../pipeline_stable_diffusion.py | 73 +++++++++++------ .../pipeline_stable_diffusion_depth2img.py | 1 - .../pipeline_stable_diffusion_img2img.py | 67 ++++++++++------ .../pipeline_stable_diffusion_inpaint.py | 67 ++++++++++------ ...line_stable_diffusion_gligen_text_image.py | 8 +- .../pipeline_stable_diffusion_k_diffusion.py | 1 - .../pipeline_stable_diffusion_ldm3d.py | 80 +++++++++++++------ .../pipeline_stable_diffusion_panorama.py | 75 +++++++++++------ .../pipeline_stable_diffusion_safe.py | 1 - .../pipeline_stable_diffusion_sag.py | 1 - .../pipeline_stable_diffusion_xl.py | 69 ++++++++++------ .../pipeline_stable_diffusion_xl_img2img.py | 69 ++++++++++------ .../pipeline_stable_diffusion_xl_inpaint.py | 69 ++++++++++------ .../pipeline_stable_diffusion_xl_adapter.py | 69 ++++++++++------ .../pipeline_text_to_video_synth.py | 1 - .../pipeline_text_to_video_synth_img2img.py | 1 - .../pipeline_text_to_video_zero.py | 1 - .../pipeline_text_to_video_zero_sdxl.py | 1 - 33 files changed, 779 insertions(+), 439 deletions(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index d386c0bf071a..370200441ea4 100644 --- 
a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -210,10 +210,12 @@ def unload_ip_adapter(self): self.image_encoder = None self.register_to_config(image_encoder=[None, None]) - # remove feature extractor - if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is not None: - self.feature_extractor = None - self.register_to_config(feature_extractor=[None, None]) + # remove feature extractor only when safety_checker is None as safety_checker uses + # the feature_extractor later + if not hasattr(self, "safety_checker"): + if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is not None: + self.feature_extractor = None + self.register_to_config(feature_extractor=[None, None]) # remove hidden encoder self.unet.encoder_hid_proj = None diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 5988e7657e13..aa1e0ad9a91c 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -427,32 +427,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents @@ -620,6 +626,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -663,6 +671,11 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents def prepare_latents( self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None @@ -882,6 +895,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -931,6 +945,9 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. 
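For context, a minimal usage sketch of the `ip_adapter_image_embeds` argument documented above (illustration only, not part of the diff). It is shown with `StableDiffusionPipeline`; the AnimateDiff, PIA and ControlNet pipelines touched in this patch accept the same keyword. The checkpoint and adapter names are the commonly used public ones, the embeddings path is a placeholder, and the tensors are assumed to have been produced earlier by `prepare_ip_adapter_image_embeds` (one tensor per loaded IP-Adapter, with the negative and positive halves already concatenated when classifier-free guidance is used):

    import torch
    from diffusers import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to("cuda")
    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

    # A list with one tensor per loaded IP-Adapter, previously saved with torch.save
    # (placeholder path).
    image_embeds = torch.load("ip_adapter_image_embeds.pt")

    image = pipe(
        "best quality, a photo of a cat",
        ip_adapter_image_embeds=image_embeds,  # ip_adapter_image is left unset; the two are mutually exclusive
        num_inference_steps=30,
        guidance_scale=7.5,
    ).images[0]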
@@ -992,6 +1009,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, ) @@ -1030,9 +1049,9 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_videos_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt ) # 4. Prepare timesteps diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index f0c39952fde1..09b76b8a2cd6 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -507,32 +507,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -588,6 +594,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, controlnet_conditioning_scale=1.0, control_guidance_start=0.0, control_guidance_end=1.0, @@ -726,6 +734,11 @@ def check_inputs( if end > 1.0: raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, torch.Tensor) @@ -910,6 +923,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -974,6 +988,9 @@ def __call__( Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -1060,6 +1077,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, controlnet_conditioning_scale, control_guidance_start, control_guidance_end, @@ -1111,9 +1130,9 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 4. Prepare image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 846da6c76d59..8121b2a8b10b 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -500,32 +500,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -581,6 +587,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, controlnet_conditioning_scale=1.0, control_guidance_start=0.0, control_guidance_end=1.0, @@ -713,6 +721,11 @@ def check_inputs( if end > 1.0: raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -924,6 +937,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -986,6 +1000,9 @@ def __call__( Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -1066,6 +1083,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, controlnet_conditioning_scale, control_guidance_start, control_guidance_end, @@ -1117,9 +1136,9 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 4. Prepare image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index bc985beae69d..f1a7a1194f96 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -625,32 +625,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -722,6 +728,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, controlnet_conditioning_scale=1.0, control_guidance_start=0.0, control_guidance_end=1.0, @@ -871,6 +879,11 @@ def check_inputs( if end > 1.0: raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -1136,6 +1149,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1221,6 +1235,9 @@ def __call__( Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -1305,6 +1322,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, controlnet_conditioning_scale, control_guidance_start, control_guidance_end, @@ -1365,9 +1384,9 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 4. Prepare image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 5165d193dcfd..86eed55e3c2d 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -516,32 +516,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -573,6 +579,8 @@ def check_inputs( prompt_embeds=None, negative_prompt_embeds=None, pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, negative_pooled_prompt_embeds=None, controlnet_conditioning_scale=1.0, control_guidance_start=0.0, @@ -734,6 +742,11 @@ def check_inputs( if end > 1.0: raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -962,6 +975,7 @@ def __call__( pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1043,6 +1057,9 @@ def __call__( weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -1153,6 +1170,8 @@ def __call__( prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, negative_pooled_prompt_embeds, controlnet_conditioning_scale, control_guidance_start, @@ -1210,9 +1229,9 @@ def __call__( ) # 3.2 Encode ip_adapter_image - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 4. Prepare image @@ -1389,7 +1408,7 @@ def __call__( down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: added_cond_kwargs["image_embeds"] = image_embeds # predict the noise residual diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index ca6b5165fefb..a489bebbc9e7 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -565,32 +565,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -625,6 +631,8 @@ def check_inputs( negative_prompt_embeds=None, pooled_prompt_embeds=None, negative_pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, controlnet_conditioning_scale=1.0, control_guidance_start=0.0, control_guidance_end=1.0, @@ -795,6 +803,11 @@ def check_inputs( if end > 1.0: raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -1092,6 +1105,7 @@ def __call__( pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1191,6 +1205,9 @@ def __call__( weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
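As a rough illustration of the embedding format these pipelines expect (a sketch; `pipe`, `ip_image`, `prompt`, `init_image`, `control_image` and `device` are assumed to be defined already): the list holds one tensor per IP-Adapter, and when classifier-free guidance is enabled the negative embeddings come first, mirroring the `torch.cat([single_negative_image_embeds, single_image_embeds])` in the hunk above:

    import torch

    num_images_per_prompt = 1
    # encode_image() returns (image_embeds, uncond_image_embeds); output_hidden_state
    # is False for the plain ImageProjection-based IP-Adapters.
    cond, uncond = pipe.encode_image(ip_image, device, 1, output_hidden_state=False)

    single = torch.stack([cond] * num_images_per_prompt, dim=0)
    single_neg = torch.stack([uncond] * num_images_per_prompt, dim=0)
    image_embeds = [torch.cat([single_neg, single]).to(device)]  # negative half first under CFG

    result = pipe(
        prompt,
        image=init_image,
        control_image=control_image,
        ip_adapter_image_embeds=image_embeds,
    ).images[0]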
@@ -1314,6 +1331,8 @@ def __call__( negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, controlnet_conditioning_scale, control_guidance_start, control_guidance_end, @@ -1370,9 +1389,9 @@ def __call__( ) # 3.2 Encode ip_adapter_image - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 4. Prepare image and controlnet_conditioning_image @@ -1537,7 +1556,7 @@ def __call__( down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: added_cond_kwargs["image_embeds"] = image_embeds # predict the noise residual diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py index da2f4ba9b6e9..739ed76b4738 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py @@ -462,7 +462,6 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py index 449b6d88b9de..3a43533aed2c 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py @@ -477,7 +477,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py index 1ee0e0161db9..a7f02fc7ae66 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py @@ -429,7 +429,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py index 
3c9d744c6dfa..c4b85ad15e40 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py @@ -427,7 +427,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index de6ab3891214..169c57ddc2e1 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -246,7 +246,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index 509b5ab34bde..02ce12c049d9 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -477,33 +477,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, do_classifier_free_guidance, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. 
Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -645,6 +650,8 @@ def check_inputs( strength: float, callback_steps: int, prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if strength < 0 or strength > 1: @@ -675,6 +682,11 @@ def check_inputs( elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + @property def guidance_scale(self): return self._guidance_scale @@ -707,6 +719,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -757,6 +770,9 @@ def __call__( provided, text embeddings are generated from the `prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -804,7 +820,15 @@ def __call__( ) # 1. Check inputs. 
Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, prompt_embeds, callback_on_step_end_tensor_inputs) + self.check_inputs( + prompt, + strength, + callback_steps, + prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) self._guidance_scale = guidance_scale self._clip_skip = clip_skip self._cross_attention_kwargs = cross_attention_kwargs @@ -818,11 +842,10 @@ def __call__( batch_size = prompt_embeds.shape[0] device = self._execution_device - # do_classifier_free_guidance = guidance_scale > 1.0 - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, False, device, batch_size * num_images_per_prompt ) # 3. Encode input prompt diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index 077de49cdc87..4db6993dcf5c 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -649,6 +649,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -687,33 +689,44 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
) - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents @@ -998,6 +1011,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, motion_scale: int = 0, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -1050,6 +1064,9 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. motion_scale: (`int`, *optional*, defaults to 0): Parameter that controls the amount and type of motion that is added to the image. Increasing the value increases the amount of motion, while specific ranges of values control the type of motion that is added. Must be between 0 and 8. 
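A short sketch of the new validation added to `check_inputs` (with `pipe`, `prompt`, `cond_image`, `ip_image` and `cached_embeds` assumed to be defined): supplying both inputs at once is rejected before any computation happens:

    try:
        pipe(
            prompt=prompt,
            image=cond_image,
            ip_adapter_image=ip_image,
            ip_adapter_image_embeds=cached_embeds,  # conflicts with ip_adapter_image
        )
    except ValueError as err:
        # "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. ..."
        print(err)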
@@ -1099,6 +1116,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, ) @@ -1137,9 +1156,9 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_videos_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt ) # 4. Prepare timesteps diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index 19bd1f16152c..a1cb3f5af378 100644 --- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -136,7 +136,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 46b834f9ce82..7c3f1273a107 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -514,32 +514,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds def run_safety_checker(self, image, device, dtype): @@ -593,6 +599,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -636,6 +644,11 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) if isinstance(generator, list) and len(generator) != batch_size: @@ -818,6 +831,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -869,6 +883,9 @@ def __call__( Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -934,6 +951,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, ) @@ -976,9 +995,9 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 4. Prepare timesteps @@ -1001,7 +1020,11 @@ def __call__( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 6.1 Add image embeds for IP-Adapter - added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) + else None + ) # 6.2 Optionally get Guidance Scale Embedding timestep_cond = None diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 440a972ff8e0..0aabdb9b58f7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -408,7 +408,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index a9b04b493c7e..f05e63c68e0c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -529,32 +529,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
- ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -610,6 +616,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if strength < 0 or strength > 1: @@ -653,6 +661,11 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep init_timestep = min(int(num_inference_steps * strength), num_inference_steps) @@ -884,6 +897,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -938,6 +952,9 @@ def __call__( Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -992,6 +1009,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, ) @@ -1031,9 +1050,9 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 4. Preprocess image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 111a70aa5c09..e5ea92d793bd 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -601,32 +601,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -675,6 +681,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, padding_mask_crop=None, ): @@ -735,6 +743,11 @@ def check_inputs( if output_type != "pil": raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.") + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + def prepare_latents( self, batch_size, @@ -1031,6 +1044,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1107,6 +1121,9 @@ def __call__( Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -1199,6 +1216,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, padding_mask_crop, ) @@ -1239,9 +1258,9 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 4. set timesteps diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py index 2c172ce46e45..b7fd8416d32a 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py @@ -474,7 +474,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, @@ -484,6 +483,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -527,6 +528,11 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py index 71b2ef27e07d..6cbd832705bf 100755 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -386,7 +386,6 @@ def decode_latents(self, latents): image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index 5b3aae13f4f5..aa925659559a 100644 --- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -438,33 +438,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, do_classifier_free_guidance, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds def run_safety_checker(self, image, device, dtype): @@ -510,6 +515,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -553,6 +560,11 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) if isinstance(generator, list) and len(generator) != batch_size: @@ -587,6 +599,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -633,6 +646,9 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -665,7 +681,15 @@ def __call__( # 1. Check inputs. Raise error if not correct self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, ) # 2. Define call parameters @@ -682,9 +706,13 @@ def __call__( # corresponds to doing no classifier free guidance. 
do_classifier_free_guidance = guidance_scale > 1.0 - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + do_classifier_free_guidance, + device, + batch_size * num_images_per_prompt, ) # 3. Encode input prompt diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index edf93839de05..a422bcec8b35 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -397,32 +397,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -493,6 +499,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -536,6 +544,11 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) @@ -592,6 +605,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -643,6 +657,9 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -680,7 +697,15 @@ def __call__( # 1. Check inputs. Raise error if not correct self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, ) # 2. 
Define call parameters @@ -697,9 +722,9 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 3. Encode input prompt diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index 7d5bc28cb88c..d72698cdc6a3 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -361,7 +361,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py index 4a253643aab2..e9448489a1b8 100644 --- a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py @@ -462,7 +462,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 79f0aa379ae8..10f5c6e829de 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -550,32 +550,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
- ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -609,6 +615,8 @@ def check_inputs( negative_prompt_embeds=None, pooled_prompt_embeds=None, negative_pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -675,6 +683,11 @@ def check_inputs( "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) @@ -905,6 +918,7 @@ def __call__( pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -994,6 +1008,9 @@ def __call__( weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -1092,6 +1109,8 @@ def __call__( negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, ) @@ -1191,9 +1210,9 @@ def __call__( add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 8. Denoising loop @@ -1236,7 +1255,7 @@ def __call__( # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: added_cond_kwargs["image_embeds"] = image_embeds noise_pred = self.unet( latent_model_input, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 76416a6d331d..c00d79a3a54a 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -575,6 +575,8 @@ def check_inputs( negative_prompt_2=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if strength < 0 or strength > 1: @@ -637,6 +639,11 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): # get the original timestep using init_timestep if denoising_start is None: @@ -767,32 +774,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds def _get_add_time_ids( @@ -1047,6 +1060,7 @@ def __call__( pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1145,6 +1159,9 @@ def __call__( weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -1245,6 +1262,8 @@ def __call__( negative_prompt_2, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, ) @@ -1365,9 +1384,9 @@ def denoising_value_valid(dnv): add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 9. 
Denoising loop @@ -1416,7 +1435,7 @@ def denoising_value_valid(dnv): # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: added_cond_kwargs["image_embeds"] = image_embeds noise_pred = self.unet( latent_model_input, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 248c990b2cdd..e8d728e3973f 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -488,32 +488,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt @@ -784,6 +790,8 @@ def check_inputs( negative_prompt_2=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, padding_mask_crop=None, ): @@ -856,6 +864,11 @@ def check_inputs( if output_type != "pil": raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.") + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + def prepare_latents( self, batch_size, @@ -1288,6 +1301,7 @@ def __call__( pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1397,6 +1411,9 @@ def __call__( weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. 
eta (`float`, *optional*, defaults to 0.0): @@ -1512,6 +1529,8 @@ def __call__( negative_prompt_2, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, padding_mask_crop, ) @@ -1713,9 +1732,9 @@ def denoising_value_valid(dnv): add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 11. Denoising loop @@ -1766,7 +1785,7 @@ def denoising_value_valid(dnv): # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: added_cond_kwargs["image_embeds"] = image_embeds noise_pred = self.unet( latent_model_input, diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 1e97ce4da43e..4181b244135e 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -564,32 +564,38 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt): - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] - image_embeds = [] - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) - single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0) + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) - if self.do_classifier_free_guidance: - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) - single_image_embeds = single_image_embeds.to(device) + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) - image_embeds.append(single_image_embeds) + if self.do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + image_embeds.append(single_image_embeds) + else: + image_embeds = ip_adapter_image_embeds return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -624,6 +630,8 @@ def check_inputs( negative_prompt_embeds=None, pooled_prompt_embeds=None, negative_pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -690,6 +698,11 @@ def check_inputs( "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) @@ -867,6 +880,7 @@ def __call__( pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -959,6 +973,9 @@ def __call__( weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
@@ -1060,6 +1077,8 @@ def __call__( negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, ) self._guidance_scale = guidance_scale @@ -1096,9 +1115,9 @@ def __call__( ) # 3.2 Encode ip_adapter_image - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt + ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt ) # 4. Prepare timesteps @@ -1199,7 +1218,7 @@ def __call__( added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: added_cond_kwargs["image_embeds"] = image_embeds # predict the noise residual diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 6e5db85c9e66..984d856f3c32 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -418,7 +418,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index 342a81b81a2e..cb0271c5bc30 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -495,7 +495,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index 64bdb476fe2d..4e85e5f07ca3 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -447,7 +447,6 @@ def backward_loop( callback(step_idx, t, latents) return latents.clone().detach() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py index c31fa4f90cea..5b685bc8d96b 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py @@ -510,7 +510,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from 
diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs def check_inputs( self, prompt, From 491a933a1bf79d1f9cd3bc5903fc609ae6d6a9ac Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 8 Feb 2024 12:30:14 +0530 Subject: [PATCH 28/43] [I2VGenXL] `attention_head_dim` in the UNet (#6872) * attention_head_dim * debug * print more info * correct num_attention_heads behaviour * down_block_num_attention_heads -> num_attention_heads. * correct the image link in doc. * add: deprecation for num_attention_head * fix: test argument to use attention_head_dim * more fixes. * quality * address comments. * remove deprecation. --- src/diffusers/models/attention.py | 1 + src/diffusers/models/unets/unet_i2vgen_xl.py | 12 +++++++++++- .../pipelines/i2vgen_xl/pipeline_i2vgen_xl.py | 2 +- tests/pipelines/i2vgen_xl/test_i2vgenxl.py | 3 ++- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index d4d611250ad0..f9d83afbd2ed 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -158,6 +158,7 @@ def __init__( super().__init__() self.only_cross_attention = only_cross_attention + # We keep these boolean flags for backward-compatibility. self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" self.use_ada_layer_norm_single = norm_type == "ada_norm_single" diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index de4acb7e0d07..eb8c0b50a617 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -120,6 +120,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. If `None`, normalization and activation layers is skipped in post-processing. cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features. + attention_head_dim (`int`, *optional*, defaults to 64): The dimension of the attention heads. num_attention_heads (`int`, *optional*): The number of attention heads. """ @@ -147,10 +148,19 @@ def __init__( layers_per_block: int = 2, norm_num_groups: Optional[int] = 32, cross_attention_dim: int = 1024, - num_attention_heads: Optional[Union[int, Tuple[int]]] = 64, + attention_head_dim: Union[int, Tuple[int]] = 64, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, ): super().__init__() + # When we first integrated the UNet into the library, we didn't have `attention_head_dim`. As a consequence + # of that, we used `num_attention_heads` for arguments that actually denote attention head dimension. This + # is why we ignore `num_attention_heads` and calculate it from `attention_head_dim` below. + # This is still an incorrect way of calculating `num_attention_heads` but we need to stick to it + # without running proper deprecation cycles for the {down,mid,up} blocks which are a + # part of the public API. 
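+ # For example, the I2VGenXL test updated at the end of this patch now builds its dummy UNet with `attention_head_dim=4, num_attention_heads=None` where it previously passed `num_attention_heads=4`; with the assignment below, both spellings hand the same value of 4 to the {down,mid,up} blocks.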
+ num_attention_heads = attention_head_dim + # Check inputs if len(down_block_types) != len(up_block_types): raise ValueError( diff --git a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py index 5988957cb10f..4f6ce85aaa85 100644 --- a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +++ b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py @@ -46,7 +46,7 @@ >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") >>> pipeline.enable_model_cpu_offload() - >>> image_url = "https://github.com/ali-vilab/i2vgen-xl/blob/main/data/test_images/img_0009.png?raw=true" + >>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png" >>> image = load_image(image_url).convert("RGB") >>> prompt = "Papers were floating in the air on a table in the library" diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py index acd9f9140d0c..de8e2e331099 100644 --- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py +++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py @@ -80,7 +80,8 @@ def get_dummy_components(self): down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"), up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"), cross_attention_dim=4, - num_attention_heads=4, + attention_head_dim=4, + num_attention_heads=None, norm_num_groups=2, ) From 17808a091e2d5615c2ed8a63d7ae6f2baea11e1e Mon Sep 17 00:00:00 2001 From: Ehsan Akhgari Date: Thu, 8 Feb 2024 08:22:11 -0500 Subject: [PATCH 29/43] Fix bug when converting checkpoint to diffusers format (#6900) This fixes #6899. --- src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 1ba06f811a02..f966f9794d3b 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -1320,6 +1320,9 @@ def download_from_original_stable_diffusion_ckpt( else: with open(original_config_file, "r") as f: original_config_file = f.read() + else: + with open(original_config_file, "r") as f: + original_config_file = f.read() original_config = yaml.safe_load(original_config_file) From 3ac235779484a6410cf0beaa5b57c4fd495bad86 Mon Sep 17 00:00:00 2001 From: Patryk Bartkowiak Date: Thu, 8 Feb 2024 17:09:03 +0100 Subject: [PATCH 30/43] changed positional parameters to named parameters like in docs (#6905) Co-authored-by: Patryk Bartkowiak Co-authored-by: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com> --- scripts/convert_diffusers_sdxl_lora_to_webui.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/convert_diffusers_sdxl_lora_to_webui.py b/scripts/convert_diffusers_sdxl_lora_to_webui.py index c054226fa286..dfb3871275cb 100644 --- a/scripts/convert_diffusers_sdxl_lora_to_webui.py +++ b/scripts/convert_diffusers_sdxl_lora_to_webui.py @@ -39,14 +39,15 @@ def convert_and_save(input_lora, output_lora=None): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Convert LoRA model to PEFT and then to Kohya format.") parser.add_argument( - "input_lora", + "--input_lora", type=str, + required=True, help="Path to the input LoRA model file in the diffusers format.", ) parser.add_argument( - "output_lora", + "--output_lora", type=str, - nargs="?", + 
required=False, help="Path for the converted LoRA (safetensors format for AUTOMATIC1111, ComfyUI, etc.). Optional, defaults to input name with a _webui suffix.", ) From 8de78001df95a641bf6ef942bee9553921d44490 Mon Sep 17 00:00:00 2001 From: Masamune Ishihara <1396267+masaishi@users.noreply.github.com> Date: Thu, 8 Feb 2024 08:29:51 -0800 Subject: [PATCH 31/43] Add fps argument to export_to_gif function. (#6786) --- src/diffusers/utils/export_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/utils/export_utils.py b/src/diffusers/utils/export_utils.py index 4cecc2bfdfce..24cce07ab866 100644 --- a/src/diffusers/utils/export_utils.py +++ b/src/diffusers/utils/export_utils.py @@ -28,7 +28,7 @@ def buffered_writer(raw_f): f.flush() -def export_to_gif(image: List[PIL.Image.Image], output_gif_path: str = None) -> str: +def export_to_gif(image: List[PIL.Image.Image], output_gif_path: str = None, fps: int = 10) -> str: if output_gif_path is None: output_gif_path = tempfile.NamedTemporaryFile(suffix=".gif").name @@ -37,7 +37,7 @@ def export_to_gif(image: List[PIL.Image.Image], output_gif_path: str = None) -> save_all=True, append_images=image[1:], optimize=False, - duration=100, + duration=1000 // fps, loop=0, ) return output_gif_path From 30e5e81d58eb9c3979c07e6626bae89c1df8c0e1 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 8 Feb 2024 23:49:31 +0530 Subject: [PATCH 32/43] change to 2024 in the license (#6902) change to 2024 --- CONTRIBUTING.md | 2 +- PHILOSOPHY.md | 2 +- docs/README.md | 2 +- docs/TRANSLATING.md | 2 +- docs/source/en/api/activations.md | 2 +- docs/source/en/api/attnprocessor.md | 2 +- docs/source/en/api/configuration.md | 2 +- docs/source/en/api/image_processor.md | 2 +- docs/source/en/api/internal_classes_overview.md | 2 +- docs/source/en/api/loaders/ip_adapter.md | 2 +- docs/source/en/api/loaders/lora.md | 2 +- docs/source/en/api/loaders/peft.md | 2 +- docs/source/en/api/loaders/single_file.md | 2 +- docs/source/en/api/loaders/textual_inversion.md | 2 +- docs/source/en/api/loaders/unet.md | 2 +- docs/source/en/api/logging.md | 2 +- docs/source/en/api/models/asymmetricautoencoderkl.md | 2 +- docs/source/en/api/models/autoencoder_tiny.md | 2 +- docs/source/en/api/models/autoencoderkl.md | 2 +- docs/source/en/api/models/controlnet.md | 2 +- docs/source/en/api/models/overview.md | 2 +- docs/source/en/api/models/prior_transformer.md | 2 +- docs/source/en/api/models/transformer2d.md | 2 +- docs/source/en/api/models/transformer_temporal.md | 2 +- docs/source/en/api/models/unet-motion.md | 2 +- docs/source/en/api/models/unet.md | 2 +- docs/source/en/api/models/unet2d-cond.md | 2 +- docs/source/en/api/models/unet2d.md | 2 +- docs/source/en/api/models/unet3d-cond.md | 2 +- docs/source/en/api/models/vq.md | 2 +- docs/source/en/api/normalization.md | 2 +- docs/source/en/api/outputs.md | 2 +- docs/source/en/api/pipelines/amused.md | 2 +- docs/source/en/api/pipelines/animatediff.md | 2 +- docs/source/en/api/pipelines/attend_and_excite.md | 2 +- docs/source/en/api/pipelines/audioldm.md | 2 +- docs/source/en/api/pipelines/audioldm2.md | 2 +- docs/source/en/api/pipelines/auto_pipeline.md | 2 +- docs/source/en/api/pipelines/blip_diffusion.md | 2 +- docs/source/en/api/pipelines/consistency_models.md | 2 +- docs/source/en/api/pipelines/controlnet.md | 2 +- docs/source/en/api/pipelines/controlnet_sdxl.md | 2 +- docs/source/en/api/pipelines/dance_diffusion.md | 2 +- docs/source/en/api/pipelines/ddim.md | 2 +- docs/source/en/api/pipelines/ddpm.md | 2 +- 
docs/source/en/api/pipelines/deepfloyd_if.md | 2 +- docs/source/en/api/pipelines/diffedit.md | 2 +- docs/source/en/api/pipelines/dit.md | 2 +- docs/source/en/api/pipelines/i2vgenxl.md | 2 +- docs/source/en/api/pipelines/kandinsky.md | 2 +- docs/source/en/api/pipelines/kandinsky3.md | 2 +- docs/source/en/api/pipelines/kandinsky_v22.md | 2 +- docs/source/en/api/pipelines/latent_consistency_models.md | 2 +- docs/source/en/api/pipelines/latent_diffusion.md | 2 +- docs/source/en/api/pipelines/musicldm.md | 2 +- docs/source/en/api/pipelines/overview.md | 2 +- docs/source/en/api/pipelines/paint_by_example.md | 2 +- docs/source/en/api/pipelines/panorama.md | 2 +- docs/source/en/api/pipelines/pia.md | 2 +- docs/source/en/api/pipelines/pix2pix.md | 2 +- docs/source/en/api/pipelines/pixart.md | 2 +- docs/source/en/api/pipelines/self_attention_guidance.md | 2 +- docs/source/en/api/pipelines/semantic_stable_diffusion.md | 2 +- docs/source/en/api/pipelines/shap_e.md | 2 +- docs/source/en/api/pipelines/stable_diffusion/adapter.md | 2 +- docs/source/en/api/pipelines/stable_diffusion/depth2img.md | 2 +- docs/source/en/api/pipelines/stable_diffusion/gligen.md | 2 +- .../en/api/pipelines/stable_diffusion/image_variation.md | 2 +- docs/source/en/api/pipelines/stable_diffusion/img2img.md | 2 +- docs/source/en/api/pipelines/stable_diffusion/inpaint.md | 2 +- docs/source/en/api/pipelines/stable_diffusion/k_diffusion.md | 2 +- .../en/api/pipelines/stable_diffusion/latent_upscale.md | 2 +- .../en/api/pipelines/stable_diffusion/ldm3d_diffusion.md | 2 +- docs/source/en/api/pipelines/stable_diffusion/overview.md | 2 +- docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md | 2 +- .../en/api/pipelines/stable_diffusion/stable_diffusion_2.md | 2 +- .../api/pipelines/stable_diffusion/stable_diffusion_safe.md | 2 +- .../en/api/pipelines/stable_diffusion/stable_diffusion_xl.md | 2 +- docs/source/en/api/pipelines/stable_diffusion/text2img.md | 2 +- docs/source/en/api/pipelines/stable_diffusion/upscale.md | 2 +- docs/source/en/api/pipelines/stable_unclip.md | 2 +- docs/source/en/api/pipelines/text_to_video.md | 2 +- docs/source/en/api/pipelines/text_to_video_zero.md | 2 +- docs/source/en/api/pipelines/unclip.md | 2 +- docs/source/en/api/pipelines/unidiffuser.md | 2 +- docs/source/en/api/pipelines/value_guided_sampling.md | 2 +- docs/source/en/api/pipelines/wuerstchen.md | 2 +- docs/source/en/api/schedulers/cm_stochastic_iterative.md | 2 +- docs/source/en/api/schedulers/ddim.md | 2 +- docs/source/en/api/schedulers/ddim_inverse.md | 2 +- docs/source/en/api/schedulers/ddpm.md | 2 +- docs/source/en/api/schedulers/deis.md | 2 +- docs/source/en/api/schedulers/dpm_discrete.md | 2 +- docs/source/en/api/schedulers/dpm_discrete_ancestral.md | 2 +- docs/source/en/api/schedulers/dpm_sde.md | 2 +- docs/source/en/api/schedulers/euler.md | 2 +- docs/source/en/api/schedulers/euler_ancestral.md | 2 +- docs/source/en/api/schedulers/heun.md | 2 +- docs/source/en/api/schedulers/ipndm.md | 2 +- docs/source/en/api/schedulers/lcm.md | 2 +- docs/source/en/api/schedulers/lms_discrete.md | 2 +- docs/source/en/api/schedulers/multistep_dpm_solver.md | 2 +- docs/source/en/api/schedulers/multistep_dpm_solver_inverse.md | 2 +- docs/source/en/api/schedulers/overview.md | 2 +- docs/source/en/api/schedulers/pndm.md | 2 +- docs/source/en/api/schedulers/repaint.md | 2 +- docs/source/en/api/schedulers/score_sde_ve.md | 2 +- docs/source/en/api/schedulers/score_sde_vp.md | 2 +- docs/source/en/api/schedulers/singlestep_dpm_solver.md | 2 +- 
docs/source/en/api/schedulers/stochastic_karras_ve.md | 2 +- docs/source/en/api/schedulers/unipc.md | 2 +- docs/source/en/api/schedulers/vq_diffusion.md | 2 +- docs/source/en/api/utilities.md | 2 +- docs/source/en/conceptual/contribution.md | 2 +- docs/source/en/conceptual/ethical_guidelines.md | 2 +- docs/source/en/conceptual/evaluation.md | 2 +- docs/source/en/conceptual/philosophy.md | 2 +- docs/source/en/index.md | 2 +- docs/source/en/installation.md | 2 +- docs/source/en/optimization/coreml.md | 2 +- docs/source/en/optimization/deepcache.md | 2 +- docs/source/en/optimization/fp16.md | 2 +- docs/source/en/optimization/habana.md | 2 +- docs/source/en/optimization/memory.md | 2 +- docs/source/en/optimization/mps.md | 2 +- docs/source/en/optimization/onnx.md | 2 +- docs/source/en/optimization/open_vino.md | 2 +- docs/source/en/optimization/opt_overview.md | 2 +- docs/source/en/optimization/tome.md | 2 +- docs/source/en/optimization/torch2.0.md | 2 +- docs/source/en/optimization/xformers.md | 2 +- docs/source/en/quicktour.md | 2 +- docs/source/en/stable_diffusion.md | 2 +- docs/source/en/training/controlnet.md | 2 +- docs/source/en/training/custom_diffusion.md | 2 +- docs/source/en/training/ddpo.md | 2 +- docs/source/en/training/distributed_inference.md | 2 +- docs/source/en/training/dreambooth.md | 2 +- docs/source/en/training/instructpix2pix.md | 2 +- docs/source/en/training/kandinsky.md | 2 +- docs/source/en/training/lcm_distill.md | 2 +- docs/source/en/training/lora.md | 2 +- docs/source/en/training/overview.md | 2 +- docs/source/en/training/sdxl.md | 2 +- docs/source/en/training/t2i_adapters.md | 2 +- docs/source/en/training/text2image.md | 2 +- docs/source/en/training/text_inversion.md | 2 +- docs/source/en/training/unconditional_training.md | 2 +- docs/source/en/training/wuerstchen.md | 2 +- docs/source/en/tutorials/autopipeline.md | 2 +- docs/source/en/tutorials/basic_training.md | 2 +- docs/source/en/tutorials/fast_diffusion.md | 2 +- docs/source/en/tutorials/tutorial_overview.md | 2 +- docs/source/en/tutorials/using_peft_for_inference.md | 2 +- docs/source/en/using-diffusers/callback.md | 2 +- .../source/en/using-diffusers/conditional_image_generation.md | 2 +- docs/source/en/using-diffusers/contribute_pipeline.md | 2 +- docs/source/en/using-diffusers/control_brightness.md | 2 +- docs/source/en/using-diffusers/controlling_generation.md | 2 +- docs/source/en/using-diffusers/controlnet.md | 2 +- docs/source/en/using-diffusers/custom_pipeline_examples.md | 2 +- docs/source/en/using-diffusers/custom_pipeline_overview.md | 2 +- docs/source/en/using-diffusers/depth2img.md | 2 +- docs/source/en/using-diffusers/diffedit.md | 2 +- docs/source/en/using-diffusers/distilled_sd.md | 2 +- docs/source/en/using-diffusers/freeu.md | 2 +- docs/source/en/using-diffusers/img2img.md | 2 +- docs/source/en/using-diffusers/inference_with_lcm.md | 2 +- docs/source/en/using-diffusers/inference_with_lcm_lora.md | 2 +- docs/source/en/using-diffusers/inpaint.md | 2 +- docs/source/en/using-diffusers/kandinsky.md | 2 +- docs/source/en/using-diffusers/loading.md | 2 +- docs/source/en/using-diffusers/loading_adapters.md | 2 +- docs/source/en/using-diffusers/loading_overview.md | 2 +- docs/source/en/using-diffusers/other-formats.md | 2 +- docs/source/en/using-diffusers/other-modalities.md | 2 +- docs/source/en/using-diffusers/pipeline_overview.md | 2 +- docs/source/en/using-diffusers/push_to_hub.md | 2 +- docs/source/en/using-diffusers/reproducibility.md | 2 +- docs/source/en/using-diffusers/reusing_seeds.md | 2 
+- docs/source/en/using-diffusers/schedulers.md | 2 +- docs/source/en/using-diffusers/sdxl.md | 2 +- docs/source/en/using-diffusers/sdxl_turbo.md | 2 +- docs/source/en/using-diffusers/shap-e.md | 2 +- docs/source/en/using-diffusers/stable_diffusion_jax_how_to.md | 2 +- docs/source/en/using-diffusers/svd.md | 2 +- docs/source/en/using-diffusers/textual_inversion_inference.md | 2 +- .../en/using-diffusers/unconditional_image_generation.md | 2 +- docs/source/en/using-diffusers/using_safetensors.md | 2 +- docs/source/en/using-diffusers/weighted_prompts.md | 2 +- docs/source/en/using-diffusers/write_own_pipeline.md | 2 +- docs/source/ja/index.md | 2 +- docs/source/ja/installation.md | 2 +- docs/source/ja/quicktour.md | 2 +- docs/source/ja/stable_diffusion.md | 2 +- docs/source/ja/tutorials/autopipeline.md | 2 +- docs/source/ja/tutorials/tutorial_overview.md | 2 +- .../ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md | 2 +- docs/source/ko/in_translation.md | 2 +- docs/source/ko/index.md | 2 +- docs/source/ko/installation.md | 2 +- docs/source/ko/optimization/coreml.md | 2 +- docs/source/ko/optimization/fp16.md | 2 +- docs/source/ko/optimization/habana.md | 2 +- docs/source/ko/optimization/mps.md | 2 +- docs/source/ko/optimization/onnx.md | 2 +- docs/source/ko/optimization/open_vino.md | 2 +- docs/source/ko/optimization/opt_overview.md | 2 +- docs/source/ko/optimization/tome.md | 2 +- docs/source/ko/optimization/torch2.0.md | 2 +- docs/source/ko/optimization/xformers.md | 2 +- docs/source/ko/quicktour.md | 2 +- docs/source/ko/stable_diffusion.md | 2 +- docs/source/ko/training/adapt_a_model.md | 2 +- docs/source/ko/training/controlnet.md | 2 +- docs/source/ko/training/custom_diffusion.md | 2 +- docs/source/ko/training/dreambooth.md | 2 +- docs/source/ko/training/instructpix2pix.md | 2 +- docs/source/ko/training/lora.md | 2 +- docs/source/ko/training/overview.md | 2 +- docs/source/ko/training/text2image.md | 2 +- docs/source/ko/training/text_inversion.md | 2 +- docs/source/ko/training/unconditional_training.md | 2 +- docs/source/ko/tutorials/basic_training.md | 2 +- docs/source/ko/tutorials/tutorial_overview.md | 2 +- .../source/ko/using-diffusers/conditional_image_generation.md | 2 +- docs/source/ko/using-diffusers/contribute_pipeline.md | 2 +- docs/source/ko/using-diffusers/controlling_generation.md | 2 +- docs/source/ko/using-diffusers/custom_pipeline_examples.md | 2 +- docs/source/ko/using-diffusers/custom_pipeline_overview.md | 2 +- docs/source/ko/using-diffusers/depth2img.md | 2 +- docs/source/ko/using-diffusers/img2img.md | 2 +- docs/source/ko/using-diffusers/inpaint.md | 2 +- docs/source/ko/using-diffusers/loading.md | 2 +- docs/source/ko/using-diffusers/loading_overview.md | 2 +- docs/source/ko/using-diffusers/other-formats.md | 2 +- docs/source/ko/using-diffusers/pipeline_overview.md | 2 +- docs/source/ko/using-diffusers/reproducibility.md | 2 +- docs/source/ko/using-diffusers/reusing_seeds.md | 2 +- docs/source/ko/using-diffusers/schedulers.md | 2 +- docs/source/ko/using-diffusers/stable_diffusion_jax_how_to.md | 2 +- .../ko/using-diffusers/unconditional_image_generation.md | 2 +- docs/source/ko/using-diffusers/weighted_prompts.md | 2 +- docs/source/ko/using-diffusers/write_own_pipeline.md | 2 +- docs/source/pt/index.md | 2 +- docs/source/pt/installation.md | 2 +- docs/source/pt/quicktour.md | 2 +- docs/source/zh/index.md | 2 +- docs/source/zh/installation.md | 2 +- docs/source/zh/quicktour.md | 2 +- docs/source/zh/stable_diffusion.md | 2 +- examples/README.md | 2 +- 
.../train_dreambooth_lora_sd15_advanced.py | 2 +- .../train_dreambooth_lora_sdxl_advanced.py | 2 +- examples/amused/train_amused.py | 2 +- examples/community/composable_stable_diffusion.py | 2 +- examples/community/dps_pipeline.py | 2 +- examples/community/instaflow_one_step.py | 2 +- examples/community/ip_adapter_face_id.py | 2 +- examples/community/latent_consistency_img2img.py | 2 +- examples/community/latent_consistency_txt2img.py | 2 +- examples/community/llm_grounded_diffusion.py | 2 +- examples/community/marigold_depth_estimation.py | 2 +- examples/community/pipeline_animatediff_controlnet.py | 2 +- examples/community/pipeline_animatediff_img2video.py | 2 +- examples/community/pipeline_fabric.py | 2 +- examples/community/pipeline_prompt2prompt.py | 2 +- examples/community/pipeline_stable_diffusion_upscale_ldm3d.py | 2 +- .../pipeline_stable_diffusion_xl_controlnet_adapter.py | 2 +- ...pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py | 2 +- examples/community/rerender_a_video.py | 2 +- examples/community/scheduling_ufogen.py | 2 +- examples/community/sd_text2img_k_diffusion.py | 2 +- examples/community/stable_diffusion_ipex.py | 2 +- examples/community/stable_diffusion_repaint.py | 2 +- examples/community/stable_diffusion_tensorrt_img2img.py | 2 +- examples/community/stable_diffusion_tensorrt_inpaint.py | 2 +- examples/community/stable_diffusion_tensorrt_txt2img.py | 2 +- examples/community/tiled_upscaling.py | 2 +- examples/conftest.py | 2 +- examples/consistency_distillation/test_lcm_lora.py | 2 +- .../consistency_distillation/train_lcm_distill_lora_sd_wds.py | 2 +- .../consistency_distillation/train_lcm_distill_lora_sdxl.py | 2 +- .../train_lcm_distill_lora_sdxl_wds.py | 2 +- examples/consistency_distillation/train_lcm_distill_sd_wds.py | 2 +- .../consistency_distillation/train_lcm_distill_sdxl_wds.py | 2 +- examples/controlnet/test_controlnet.py | 2 +- examples/controlnet/train_controlnet.py | 2 +- examples/controlnet/train_controlnet_flax.py | 2 +- examples/controlnet/train_controlnet_sdxl.py | 2 +- examples/custom_diffusion/retrieve.py | 2 +- examples/custom_diffusion/test_custom_diffusion.py | 2 +- examples/custom_diffusion/train_custom_diffusion.py | 2 +- examples/dreambooth/test_dreambooth.py | 2 +- examples/dreambooth/test_dreambooth_lora.py | 2 +- examples/dreambooth/train_dreambooth.py | 2 +- examples/dreambooth/train_dreambooth_lora.py | 2 +- examples/dreambooth/train_dreambooth_lora_sdxl.py | 2 +- examples/instruct_pix2pix/test_instruct_pix2pix.py | 2 +- examples/instruct_pix2pix/train_instruct_pix2pix.py | 2 +- examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py | 2 +- .../kandinsky2_2/text_to_image/train_text_to_image_decoder.py | 2 +- .../text_to_image/train_text_to_image_lora_decoder.py | 2 +- .../text_to_image/train_text_to_image_lora_prior.py | 2 +- .../kandinsky2_2/text_to_image/train_text_to_image_prior.py | 2 +- .../consistency_training/train_cm_ct_unconditional.py | 2 +- .../controlnet/train_controlnet_webdataset.py | 2 +- examples/research_projects/controlnetxs/controlnetxs.py | 2 +- .../research_projects/controlnetxs/pipeline_controlnet_xs.py | 2 +- .../controlnetxs/pipeline_controlnet_xs_sd_xl.py | 2 +- .../instructpix2pix_lora/train_instruct_pix2pix_lora.py | 2 +- examples/research_projects/lora/train_text_to_image_lora.py | 2 +- .../multi_token_textual_inversion/textual_inversion.py | 2 +- .../onnxruntime/text_to_image/train_text_to_image.py | 2 +- .../onnxruntime/textual_inversion/textual_inversion.py | 2 +- 
examples/t2i_adapter/test_t2i_adapter.py | 2 +- examples/t2i_adapter/train_t2i_adapter_sdxl.py | 2 +- examples/test_examples_utils.py | 2 +- examples/text_to_image/test_text_to_image.py | 2 +- examples/text_to_image/test_text_to_image_lora.py | 2 +- examples/text_to_image/train_text_to_image.py | 2 +- examples/text_to_image/train_text_to_image_lora.py | 2 +- examples/text_to_image/train_text_to_image_lora_sdxl.py | 2 +- examples/text_to_image/train_text_to_image_sdxl.py | 2 +- examples/textual_inversion/test_textual_inversion.py | 2 +- examples/textual_inversion/test_textual_inversion_sdxl.py | 2 +- examples/textual_inversion/textual_inversion.py | 2 +- examples/textual_inversion/textual_inversion_sdxl.py | 2 +- examples/unconditional_image_generation/test_unconditional.py | 2 +- .../text_to_image/train_text_to_image_lora_prior.py | 2 +- .../wuerstchen/text_to_image/train_text_to_image_prior.py | 2 +- scripts/change_naming_configs_and_checkpoints.py | 2 +- scripts/convert_i2vgen_to_diffusers.py | 2 +- scripts/convert_ldm_original_checkpoint_to_diffusers.py | 2 +- scripts/convert_lora_safetensor_to_diffusers.py | 2 +- scripts/convert_ms_text_to_video_to_diffusers.py | 2 +- scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py | 2 +- scripts/convert_original_audioldm2_to_diffusers.py | 2 +- scripts/convert_original_audioldm_to_diffusers.py | 2 +- scripts/convert_original_controlnet_to_diffusers.py | 2 +- scripts/convert_original_musicldm_to_diffusers.py | 2 +- scripts/convert_original_stable_diffusion_to_diffusers.py | 2 +- scripts/convert_original_t2i_adapter.py | 2 +- scripts/convert_stable_diffusion_checkpoint_to_onnx.py | 2 +- scripts/convert_versatile_diffusion_to_diffusers.py | 2 +- setup.py | 2 +- src/diffusers/commands/__init__.py | 2 +- src/diffusers/commands/diffusers_cli.py | 2 +- src/diffusers/commands/env.py | 2 +- src/diffusers/commands/fp16_safetensors.py | 2 +- src/diffusers/configuration_utils.py | 2 +- src/diffusers/dependency_versions_check.py | 2 +- src/diffusers/experimental/rl/value_guided_sampling.py | 2 +- src/diffusers/image_processor.py | 2 +- src/diffusers/loaders/autoencoder.py | 2 +- src/diffusers/loaders/controlnet.py | 2 +- src/diffusers/loaders/ip_adapter.py | 2 +- src/diffusers/loaders/lora.py | 2 +- src/diffusers/loaders/lora_conversion_utils.py | 2 +- src/diffusers/loaders/peft.py | 2 +- src/diffusers/loaders/single_file.py | 2 +- src/diffusers/loaders/single_file_utils.py | 2 +- src/diffusers/loaders/textual_inversion.py | 2 +- src/diffusers/loaders/unet.py | 2 +- src/diffusers/loaders/utils.py | 2 +- src/diffusers/models/__init__.py | 2 +- src/diffusers/models/activations.py | 2 +- src/diffusers/models/attention.py | 2 +- src/diffusers/models/attention_flax.py | 2 +- src/diffusers/models/attention_processor.py | 2 +- src/diffusers/models/autoencoders/autoencoder_asym_kl.py | 2 +- src/diffusers/models/autoencoders/autoencoder_kl.py | 2 +- .../models/autoencoders/autoencoder_kl_temporal_decoder.py | 2 +- src/diffusers/models/autoencoders/autoencoder_tiny.py | 2 +- src/diffusers/models/autoencoders/consistency_decoder_vae.py | 2 +- src/diffusers/models/autoencoders/vae.py | 2 +- src/diffusers/models/controlnet.py | 2 +- src/diffusers/models/controlnet_flax.py | 2 +- src/diffusers/models/downsampling.py | 2 +- src/diffusers/models/dual_transformer_2d.py | 2 +- src/diffusers/models/embeddings.py | 2 +- src/diffusers/models/embeddings_flax.py | 2 +- src/diffusers/models/lora.py | 2 +- src/diffusers/models/modeling_flax_pytorch_utils.py | 2 +- 
src/diffusers/models/modeling_flax_utils.py | 2 +- src/diffusers/models/modeling_pytorch_flax_utils.py | 2 +- src/diffusers/models/modeling_utils.py | 2 +- src/diffusers/models/normalization.py | 2 +- src/diffusers/models/resnet.py | 4 ++-- src/diffusers/models/resnet_flax.py | 2 +- src/diffusers/models/t5_film_transformer.py | 2 +- src/diffusers/models/transformer_2d.py | 2 +- src/diffusers/models/transformer_temporal.py | 2 +- src/diffusers/models/transformers/dual_transformer_2d.py | 2 +- src/diffusers/models/transformers/t5_film_transformer.py | 2 +- src/diffusers/models/transformers/transformer_2d.py | 2 +- src/diffusers/models/transformers/transformer_temporal.py | 2 +- src/diffusers/models/unet_1d.py | 2 +- src/diffusers/models/unet_1d_blocks.py | 2 +- src/diffusers/models/unet_2d.py | 2 +- src/diffusers/models/unet_2d_blocks.py | 2 +- src/diffusers/models/unet_2d_condition.py | 2 +- src/diffusers/models/unets/unet_1d.py | 2 +- src/diffusers/models/unets/unet_1d_blocks.py | 2 +- src/diffusers/models/unets/unet_2d.py | 2 +- src/diffusers/models/unets/unet_2d_blocks.py | 2 +- src/diffusers/models/unets/unet_2d_blocks_flax.py | 2 +- src/diffusers/models/unets/unet_2d_condition.py | 2 +- src/diffusers/models/unets/unet_2d_condition_flax.py | 2 +- src/diffusers/models/unets/unet_3d_blocks.py | 2 +- src/diffusers/models/unets/unet_3d_condition.py | 4 ++-- src/diffusers/models/unets/unet_i2vgen_xl.py | 2 +- src/diffusers/models/unets/unet_kandinsky3.py | 2 +- src/diffusers/models/unets/unet_motion_model.py | 2 +- src/diffusers/models/unets/uvit_2d.py | 2 +- src/diffusers/models/upsampling.py | 2 +- src/diffusers/models/vae_flax.py | 2 +- src/diffusers/models/vq_model.py | 2 +- src/diffusers/optimization.py | 2 +- src/diffusers/pipelines/amused/pipeline_amused.py | 2 +- src/diffusers/pipelines/amused/pipeline_amused_img2img.py | 2 +- src/diffusers/pipelines/amused/pipeline_amused_inpaint.py | 2 +- src/diffusers/pipelines/animatediff/pipeline_animatediff.py | 2 +- .../pipelines/animatediff/pipeline_animatediff_video2video.py | 2 +- src/diffusers/pipelines/audioldm/pipeline_audioldm.py | 2 +- src/diffusers/pipelines/audioldm2/modeling_audioldm2.py | 2 +- src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 2 +- src/diffusers/pipelines/auto_pipeline.py | 2 +- .../pipelines/blip_diffusion/blip_image_processing.py | 2 +- src/diffusers/pipelines/blip_diffusion/modeling_blip2.py | 2 +- src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py | 4 ++-- .../pipelines/blip_diffusion/pipeline_blip_diffusion.py | 4 ++-- .../consistency_models/pipeline_consistency_models.py | 2 +- src/diffusers/pipelines/controlnet/pipeline_controlnet.py | 2 +- .../controlnet/pipeline_controlnet_blip_diffusion.py | 4 ++-- .../pipelines/controlnet/pipeline_controlnet_img2img.py | 2 +- .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 2 +- .../pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py | 2 +- .../pipelines/controlnet/pipeline_controlnet_sd_xl.py | 2 +- .../pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py | 2 +- .../pipelines/controlnet/pipeline_flax_controlnet.py | 2 +- .../pipelines/dance_diffusion/pipeline_dance_diffusion.py | 2 +- src/diffusers/pipelines/ddim/pipeline_ddim.py | 2 +- src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 2 +- .../deprecated/alt_diffusion/pipeline_alt_diffusion.py | 2 +- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 2 +- src/diffusers/pipelines/deprecated/audio_diffusion/mel.py | 2 +- .../deprecated/audio_diffusion/pipeline_audio_diffusion.py | 2 
+- .../pipeline_latent_diffusion_uncond.py | 2 +- src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py | 2 +- .../pipelines/deprecated/repaint/pipeline_repaint.py | 2 +- .../deprecated/score_sde_ve/pipeline_score_sde_ve.py | 2 +- .../deprecated/spectrogram_diffusion/continuous_encoder.py | 2 +- .../pipelines/deprecated/spectrogram_diffusion/midi_utils.py | 2 +- .../deprecated/spectrogram_diffusion/notes_encoder.py | 2 +- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 2 +- .../stable_diffusion_variants/pipeline_cycle_diffusion.py | 2 +- .../pipeline_stable_diffusion_inpaint_legacy.py | 2 +- .../pipeline_stable_diffusion_model_editing.py | 2 +- .../pipeline_stable_diffusion_paradigms.py | 2 +- .../pipeline_stable_diffusion_pix2pix_zero.py | 2 +- .../stochastic_karras_ve/pipeline_stochastic_karras_ve.py | 2 +- .../pipeline_versatile_diffusion_dual_guided.py | 2 +- .../pipeline_versatile_diffusion_image_variation.py | 2 +- .../pipeline_versatile_diffusion_text_to_image.py | 2 +- .../deprecated/vq_diffusion/pipeline_vq_diffusion.py | 2 +- src/diffusers/pipelines/dit/pipeline_dit.py | 2 +- src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py | 2 +- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky_combined.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky_img2img.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky_inpaint.py | 2 +- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 2 +- src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py | 2 +- .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py | 2 +- .../kandinsky2_2/pipeline_kandinsky2_2_controlnet.py | 2 +- .../kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py | 2 +- .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py | 2 +- .../kandinsky2_2/pipeline_kandinsky2_2_inpainting.py | 2 +- .../pipeline_latent_consistency_img2img.py | 2 +- .../pipeline_latent_consistency_text2img.py | 2 +- .../pipelines/latent_diffusion/pipeline_latent_diffusion.py | 2 +- src/diffusers/pipelines/musicldm/pipeline_musicldm.py | 2 +- src/diffusers/pipelines/onnx_utils.py | 2 +- src/diffusers/pipelines/paint_by_example/image_encoder.py | 2 +- .../pipelines/paint_by_example/pipeline_paint_by_example.py | 2 +- src/diffusers/pipelines/pia/pipeline_pia.py | 2 +- src/diffusers/pipelines/pipeline_flax_utils.py | 2 +- src/diffusers/pipelines/pipeline_utils.py | 2 +- src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py | 2 +- src/diffusers/pipelines/shap_e/camera.py | 2 +- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 2 +- src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py | 2 +- src/diffusers/pipelines/shap_e/renderer.py | 2 +- .../pipelines/stable_diffusion/clip_image_project_model.py | 2 +- src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py | 2 +- .../stable_diffusion/pipeline_flax_stable_diffusion.py | 2 +- .../pipeline_flax_stable_diffusion_img2img.py | 2 +- .../pipeline_flax_stable_diffusion_inpaint.py | 2 +- .../stable_diffusion/pipeline_onnx_stable_diffusion.py | 2 +- .../pipeline_onnx_stable_diffusion_img2img.py | 2 +- .../pipeline_onnx_stable_diffusion_inpaint.py | 2 +- .../pipeline_onnx_stable_diffusion_upscale.py | 2 +- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_depth2img.py | 2 +- .../pipeline_stable_diffusion_image_variation.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 2 +- 
.../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 2 +- .../pipeline_stable_diffusion_instruct_pix2pix.py | 2 +- .../pipeline_stable_diffusion_latent_upscale.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_upscale.py | 2 +- .../pipelines/stable_diffusion/pipeline_stable_unclip.py | 2 +- .../stable_diffusion/pipeline_stable_unclip_img2img.py | 2 +- src/diffusers/pipelines/stable_diffusion/safety_checker.py | 2 +- .../pipelines/stable_diffusion/safety_checker_flax.py | 2 +- .../stable_diffusion/stable_unclip_image_normalizer.py | 2 +- .../pipeline_stable_diffusion_attend_and_excite.py | 2 +- .../pipeline_stable_diffusion_diffedit.py | 2 +- .../pipeline_stable_diffusion_gligen.py | 2 +- .../pipeline_stable_diffusion_gligen_text_image.py | 2 +- .../pipeline_stable_diffusion_k_diffusion.py | 2 +- .../pipeline_stable_diffusion_xl_k_diffusion.py | 2 +- .../stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py | 2 +- .../pipeline_stable_diffusion_panorama.py | 2 +- .../pipelines/stable_diffusion_safe/safety_checker.py | 2 +- .../stable_diffusion_sag/pipeline_stable_diffusion_sag.py | 2 +- .../stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py | 2 +- .../stable_diffusion_xl/pipeline_stable_diffusion_xl.py | 2 +- .../pipeline_stable_diffusion_xl_img2img.py | 2 +- .../pipeline_stable_diffusion_xl_inpaint.py | 2 +- .../pipeline_stable_diffusion_xl_instruct_pix2pix.py | 2 +- .../stable_video_diffusion/pipeline_stable_video_diffusion.py | 2 +- .../t2i_adapter/pipeline_stable_diffusion_adapter.py | 2 +- .../t2i_adapter/pipeline_stable_diffusion_xl_adapter.py | 2 +- .../text_to_video_synthesis/pipeline_text_to_video_synth.py | 2 +- .../pipeline_text_to_video_synth_img2img.py | 2 +- src/diffusers/pipelines/unclip/pipeline_unclip.py | 2 +- .../pipelines/unclip/pipeline_unclip_image_variation.py | 2 +- src/diffusers/pipelines/unclip/text_proj.py | 2 +- .../pipelines/wuerstchen/modeling_paella_vq_model.py | 2 +- .../pipelines/wuerstchen/modeling_wuerstchen_common.py | 2 +- .../pipelines/wuerstchen/modeling_wuerstchen_diffnext.py | 2 +- .../pipelines/wuerstchen/modeling_wuerstchen_prior.py | 2 +- src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py | 2 +- .../pipelines/wuerstchen/pipeline_wuerstchen_combined.py | 2 +- .../pipelines/wuerstchen/pipeline_wuerstchen_prior.py | 2 +- src/diffusers/schedulers/__init__.py | 2 +- src/diffusers/schedulers/deprecated/scheduling_karras_ve.py | 2 +- src/diffusers/schedulers/deprecated/scheduling_sde_vp.py | 2 +- src/diffusers/schedulers/scheduling_consistency_models.py | 2 +- src/diffusers/schedulers/scheduling_ddim.py | 2 +- src/diffusers/schedulers/scheduling_ddim_flax.py | 2 +- src/diffusers/schedulers/scheduling_ddim_inverse.py | 2 +- src/diffusers/schedulers/scheduling_ddim_parallel.py | 2 +- src/diffusers/schedulers/scheduling_ddpm.py | 2 +- src/diffusers/schedulers/scheduling_ddpm_flax.py | 2 +- src/diffusers/schedulers/scheduling_ddpm_parallel.py | 2 +- src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py | 2 +- src/diffusers/schedulers/scheduling_deis_multistep.py | 2 +- src/diffusers/schedulers/scheduling_dpmsolver_multistep.py | 2 +- .../schedulers/scheduling_dpmsolver_multistep_flax.py | 2 +- .../schedulers/scheduling_dpmsolver_multistep_inverse.py | 2 +- src/diffusers/schedulers/scheduling_dpmsolver_sde.py | 2 +- src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py | 2 +- .../schedulers/scheduling_euler_ancestral_discrete.py | 2 +- src/diffusers/schedulers/scheduling_euler_discrete.py | 2 +- 
src/diffusers/schedulers/scheduling_euler_discrete_flax.py | 2 +- src/diffusers/schedulers/scheduling_heun_discrete.py | 2 +- src/diffusers/schedulers/scheduling_ipndm.py | 2 +- .../schedulers/scheduling_k_dpm_2_ancestral_discrete.py | 2 +- src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py | 2 +- src/diffusers/schedulers/scheduling_karras_ve_flax.py | 2 +- src/diffusers/schedulers/scheduling_lcm.py | 2 +- src/diffusers/schedulers/scheduling_lms_discrete.py | 2 +- src/diffusers/schedulers/scheduling_lms_discrete_flax.py | 2 +- src/diffusers/schedulers/scheduling_pndm.py | 2 +- src/diffusers/schedulers/scheduling_pndm_flax.py | 2 +- src/diffusers/schedulers/scheduling_repaint.py | 2 +- src/diffusers/schedulers/scheduling_sasolver.py | 2 +- src/diffusers/schedulers/scheduling_sde_ve.py | 2 +- src/diffusers/schedulers/scheduling_sde_ve_flax.py | 2 +- src/diffusers/schedulers/scheduling_unclip.py | 2 +- src/diffusers/schedulers/scheduling_unipc_multistep.py | 2 +- src/diffusers/schedulers/scheduling_utils.py | 2 +- src/diffusers/schedulers/scheduling_utils_flax.py | 2 +- src/diffusers/schedulers/scheduling_vq_diffusion.py | 2 +- src/diffusers/utils/__init__.py | 2 +- src/diffusers/utils/accelerate_utils.py | 2 +- src/diffusers/utils/constants.py | 2 +- src/diffusers/utils/doc_utils.py | 2 +- src/diffusers/utils/dynamic_modules_utils.py | 2 +- src/diffusers/utils/hub_utils.py | 2 +- src/diffusers/utils/import_utils.py | 2 +- src/diffusers/utils/logging.py | 2 +- src/diffusers/utils/outputs.py | 2 +- src/diffusers/utils/peft_utils.py | 2 +- src/diffusers/utils/state_dict_utils.py | 2 +- src/diffusers/utils/torch_utils.py | 2 +- tests/conftest.py | 2 +- tests/fixtures/custom_pipeline/pipeline.py | 2 +- tests/fixtures/custom_pipeline/what_ever.py | 2 +- tests/lora/test_lora_layers_old_backend.py | 2 +- tests/lora/test_lora_layers_peft.py | 2 +- tests/lora/test_peft_lora_in_non_peft.py | 2 +- tests/models/autoencoders/test_models_vae.py | 2 +- tests/models/autoencoders/test_models_vq.py | 2 +- tests/models/test_layers_utils.py | 2 +- tests/models/test_modeling_common.py | 2 +- tests/models/transformers/test_models_prior.py | 2 +- tests/models/unets/test_models_unet_1d.py | 2 +- tests/models/unets/test_models_unet_2d.py | 2 +- tests/models/unets/test_models_unet_2d_condition.py | 2 +- tests/models/unets/test_models_unet_3d_condition.py | 2 +- tests/models/unets/test_models_unet_motion.py | 2 +- tests/models/unets/test_models_unet_spatiotemporal.py | 2 +- tests/models/unets/test_unet_2d_blocks.py | 2 +- tests/models/unets/test_unet_blocks_common.py | 2 +- tests/others/test_check_copies.py | 2 +- tests/others/test_check_dummies.py | 2 +- tests/others/test_config.py | 2 +- tests/others/test_dependencies.py | 2 +- tests/others/test_ema.py | 2 +- tests/others/test_hub_utils.py | 2 +- tests/others/test_image_processor.py | 2 +- tests/others/test_training.py | 2 +- tests/others/test_utils.py | 2 +- tests/pipelines/amused/test_amused.py | 2 +- tests/pipelines/amused/test_amused_img2img.py | 2 +- tests/pipelines/amused/test_amused_inpaint.py | 2 +- tests/pipelines/audioldm/test_audioldm.py | 2 +- tests/pipelines/audioldm2/test_audioldm2.py | 2 +- tests/pipelines/blipdiffusion/test_blipdiffusion.py | 2 +- tests/pipelines/controlnet/test_controlnet.py | 2 +- tests/pipelines/controlnet/test_controlnet_blip_diffusion.py | 2 +- tests/pipelines/controlnet/test_controlnet_img2img.py | 2 +- tests/pipelines/controlnet/test_controlnet_inpaint.py | 2 +- tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py | 
2 +- tests/pipelines/controlnet/test_controlnet_sdxl.py | 2 +- tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py | 2 +- tests/pipelines/controlnet/test_flax_controlnet.py | 2 +- tests/pipelines/dance_diffusion/test_dance_diffusion.py | 2 +- tests/pipelines/ddim/test_ddim.py | 2 +- tests/pipelines/ddpm/test_ddpm.py | 2 +- tests/pipelines/deepfloyd_if/test_if.py | 2 +- tests/pipelines/deepfloyd_if/test_if_img2img.py | 2 +- .../pipelines/deepfloyd_if/test_if_img2img_superresolution.py | 2 +- tests/pipelines/deepfloyd_if/test_if_inpainting.py | 2 +- .../deepfloyd_if/test_if_inpainting_superresolution.py | 2 +- tests/pipelines/deepfloyd_if/test_if_superresolution.py | 2 +- tests/pipelines/dit/test_dit.py | 2 +- tests/pipelines/i2vgen_xl/test_i2vgenxl.py | 2 +- .../pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py | 2 +- tests/pipelines/kandinsky/test_kandinsky.py | 2 +- tests/pipelines/kandinsky/test_kandinsky_combined.py | 2 +- tests/pipelines/kandinsky/test_kandinsky_img2img.py | 2 +- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 2 +- tests/pipelines/kandinsky/test_kandinsky_prior.py | 2 +- tests/pipelines/kandinsky2_2/test_kandinsky.py | 2 +- tests/pipelines/kandinsky2_2/test_kandinsky_combined.py | 2 +- tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py | 2 +- .../kandinsky2_2/test_kandinsky_controlnet_img2img.py | 2 +- tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py | 2 +- tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py | 2 +- tests/pipelines/kandinsky2_2/test_kandinsky_prior.py | 2 +- tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py | 2 +- tests/pipelines/kandinsky3/test_kandinsky3.py | 2 +- tests/pipelines/kandinsky3/test_kandinsky3_img2img.py | 2 +- tests/pipelines/latent_diffusion/test_latent_diffusion.py | 2 +- .../latent_diffusion/test_latent_diffusion_superresolution.py | 2 +- tests/pipelines/musicldm/test_musicldm.py | 2 +- tests/pipelines/paint_by_example/test_paint_by_example.py | 2 +- tests/pipelines/pixart/test_pixart.py | 2 +- tests/pipelines/pndm/test_pndm.py | 2 +- .../semantic_stable_diffusion/test_semantic_diffusion.py | 2 +- tests/pipelines/shap_e/test_shap_e.py | 2 +- tests/pipelines/shap_e/test_shap_e_img2img.py | 2 +- .../pipelines/stable_diffusion/test_onnx_stable_diffusion.py | 2 +- .../stable_diffusion/test_onnx_stable_diffusion_img2img.py | 2 +- .../stable_diffusion/test_onnx_stable_diffusion_inpaint.py | 2 +- tests/pipelines/stable_diffusion/test_stable_diffusion.py | 2 +- .../stable_diffusion/test_stable_diffusion_img2img.py | 2 +- .../stable_diffusion/test_stable_diffusion_inpaint.py | 2 +- .../test_stable_diffusion_instruction_pix2pix.py | 2 +- tests/pipelines/stable_diffusion_2/test_stable_diffusion.py | 2 +- .../test_stable_diffusion_attend_and_excite.py | 2 +- .../stable_diffusion_2/test_stable_diffusion_depth.py | 2 +- .../stable_diffusion_2/test_stable_diffusion_diffedit.py | 2 +- .../stable_diffusion_2/test_stable_diffusion_flax.py | 2 +- .../stable_diffusion_2/test_stable_diffusion_flax_inpaint.py | 2 +- .../stable_diffusion_2/test_stable_diffusion_inpaint.py | 2 +- .../test_stable_diffusion_latent_upscale.py | 2 +- .../stable_diffusion_2/test_stable_diffusion_upscale.py | 2 +- .../stable_diffusion_2/test_stable_diffusion_v_pred.py | 2 +- .../stable_diffusion_gligen/test_stable_diffusion_gligen.py | 2 +- .../test_stable_diffusion_gligen_text_image.py | 2 +- .../test_stable_diffusion_image_variation.py | 2 +- .../test_stable_diffusion_k_diffusion.py | 2 +- 
.../stable_diffusion_ldm3d/test_stable_diffusion_ldm3d.py | 2 +- .../test_stable_diffusion_panorama.py | 2 +- tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py | 2 +- .../stable_diffusion_sag/test_stable_diffusion_sag.py | 2 +- .../pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py | 2 +- .../stable_diffusion_xl/test_stable_diffusion_xl_adapter.py | 2 +- .../stable_diffusion_xl/test_stable_diffusion_xl_img2img.py | 2 +- .../stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py | 2 +- .../test_stable_diffusion_xl_instruction_pix2pix.py | 2 +- .../test_stable_diffusion_xl_k_diffusion.py | 2 +- tests/pipelines/test_pipelines.py | 2 +- tests/pipelines/test_pipelines_auto.py | 2 +- tests/pipelines/test_pipelines_combined.py | 2 +- tests/pipelines/test_pipelines_flax.py | 2 +- tests/pipelines/text_to_video_synthesis/test_text_to_video.py | 2 +- .../text_to_video_synthesis/test_text_to_video_zero.py | 2 +- .../text_to_video_synthesis/test_text_to_video_zero_sdxl.py | 2 +- .../pipelines/text_to_video_synthesis/test_video_to_video.py | 2 +- tests/pipelines/unclip/test_unclip.py | 2 +- tests/pipelines/unclip/test_unclip_image_variation.py | 2 +- tests/pipelines/wuerstchen/test_wuerstchen_combined.py | 2 +- tests/pipelines/wuerstchen/test_wuerstchen_decoder.py | 2 +- tests/pipelines/wuerstchen/test_wuerstchen_prior.py | 2 +- tests/schedulers/test_scheduler_ddim_parallel.py | 2 +- tests/schedulers/test_scheduler_ddpm_parallel.py | 2 +- tests/schedulers/test_scheduler_flax.py | 2 +- tests/schedulers/test_schedulers.py | 2 +- utils/check_config_docstrings.py | 2 +- utils/check_copies.py | 2 +- utils/check_doc_toc.py | 2 +- utils/check_dummies.py | 2 +- utils/check_inits.py | 2 +- utils/check_repo.py | 2 +- utils/check_table.py | 2 +- utils/custom_init_isort.py | 2 +- utils/get_modified_files.py | 2 +- utils/overwrite_expected_slice.py | 2 +- utils/print_env.py | 2 +- utils/stale.py | 2 +- 736 files changed, 741 insertions(+), 741 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d24b049d3b39..887e4dd43c45 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,4 @@ -