diff --git a/.github/workflows/pr_test_peft_backend.yml b/.github/workflows/pr_test_peft_backend.yml index c7a6ea4fb7c7..32ffb120d12b 100644 --- a/.github/workflows/pr_test_peft_backend.yml +++ b/.github/workflows/pr_test_peft_backend.yml @@ -32,9 +32,7 @@ jobs: python -m pip install --upgrade pip pip install .[quality] - name: Check quality - run: | - ruff check examples tests src utils scripts - ruff format examples tests src utils scripts --check + run: make quality - name: Check if failure if: ${{ failure() }} run: | @@ -53,7 +51,7 @@ jobs: run: | python -m pip install --upgrade pip pip install .[quality] - - name: Check quality + - name: Check repo consistency run: | python utils/check_copies.py python utils/check_dummies.py diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 7ec4ffa713b8..460c405c1fd4 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -40,9 +40,7 @@ jobs: python -m pip install --upgrade pip pip install .[quality] - name: Check quality - run: | - ruff check examples tests src utils scripts - ruff format examples tests src utils scripts --check + run: make quality - name: Check if failure if: ${{ failure() }} run: | @@ -61,7 +59,7 @@ jobs: run: | python -m pip install --upgrade pip pip install .[quality] - - name: Check quality + - name: Check repo consistency run: | python utils/check_copies.py python utils/check_dummies.py diff --git a/Makefile b/Makefile index c92285b48c71..9af2e8b1a5c9 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,7 @@ repo-consistency: quality: ruff check $(check_dirs) setup.py ruff format --check $(check_dirs) setup.py + doc-builder style src/diffusers docs/source --max_len 119 --check_only python utils/check_doc_toc.py # Format source code automatically and check is there are any problems left that need manual fixing @@ -55,6 +56,7 @@ extra_style_checks: style: ruff check $(check_dirs) setup.py --fix ruff format $(check_dirs) setup.py + doc-builder style src/diffusers docs/source --max_len 119 ${MAKE} autogenerate_code ${MAKE} extra_style_checks diff --git a/setup.py b/setup.py index bbf8ecfde174..91cb88398a28 100644 --- a/setup.py +++ b/setup.py @@ -134,6 +134,7 @@ "torchvision", "transformers>=4.25.1", "urllib3<=2.0.0", + "black", ] # this is a lookup table with items like: diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index e92a486bffc1..c542d51fb3f2 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -42,4 +42,5 @@ "torchvision": "torchvision", "transformers": "transformers>=4.25.1", "urllib3": "urllib3<=2.0.0", + "black": "black", } diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index daeb8fd6fa6d..eac3f9b7d578 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -173,8 +173,9 @@ def blur(image: PIL.Image.Image, blur_factor: int = 4) -> PIL.Image.Image: @staticmethod def get_crop_region(mask_image: PIL.Image.Image, width: int, height: int, pad=0): """ - Finds a rectangular region that contains all masked ares in an image, and expands region to match the aspect ratio of the original image; - for example, if user drew mask in a 128x32 region, and the dimensions for processing are 512x512, the region will be expanded to 128x128. 
+ Finds a rectangular region that contains all masked ares in an image, and expands region to match the aspect + ratio of the original image; for example, if user drew mask in a 128x32 region, and the dimensions for + processing are 512x512, the region will be expanded to 128x128. Args: mask_image (PIL.Image.Image): Mask image. @@ -183,7 +184,8 @@ def get_crop_region(mask_image: PIL.Image.Image, width: int, height: int, pad=0) pad (int, optional): Padding to be added to the crop region. Defaults to 0. Returns: - tuple: (x1, y1, x2, y2) represent a rectangular region that contains all masked ares in an image and matches the original aspect ratio. + tuple: (x1, y1, x2, y2) represent a rectangular region that contains all masked ares in an image and + matches the original aspect ratio. """ mask_image = mask_image.convert("L") @@ -265,7 +267,8 @@ def _resize_and_fill( height: int, ) -> PIL.Image.Image: """ - Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling empty with data from image. + Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center + the image within the dimensions, filling empty with data from image. Args: image: The image to resize. @@ -309,7 +312,8 @@ def _resize_and_crop( height: int, ) -> PIL.Image.Image: """ - Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess. + Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center + the image within the dimensions, cropping the excess. Args: image: The image to resize. @@ -346,12 +350,12 @@ def resize( The width to resize to. resize_mode (`str`, *optional*, defaults to `default`): The resize mode to use, can be one of `default` or `fill`. If `default`, will resize the image to fit - within the specified width and height, and it may not maintaining the original aspect ratio. - If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, filling empty with data from image. - If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, cropping the excess. - Note that resize_mode `fill` and `crop` are only supported for PIL image input. + within the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, + will resize the image to fit within the specified width and height, maintaining the aspect ratio, and + then center the image within the dimensions, filling empty with data from image. If `crop`, will resize + the image to fit within the specified width and height, maintaining the aspect ratio, and then center + the image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only + supported for PIL image input. Returns: `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`: @@ -456,19 +460,21 @@ def preprocess( Args: image (`pipeline_image_input`): - The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of supported formats. + The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of + supported formats. 
height (`int`, *optional*, defaults to `None`): - The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height. + The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default + height. width (`int`, *optional*`, defaults to `None`): - The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width. + The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width. resize_mode (`str`, *optional*, defaults to `default`): - The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit - within the specified width and height, and it may not maintaining the original aspect ratio. - If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, filling empty with data from image. - If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, cropping the excess. - Note that resize_mode `fill` and `crop` are only supported for PIL image input. + The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within + the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will + resize the image to fit within the specified width and height, maintaining the aspect ratio, and then + center the image within the dimensions, filling empty with data from image. If `crop`, will resize the + image to fit within the specified width and height, maintaining the aspect ratio, and then center the + image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only + supported for PIL image input. crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`): The crop coordinates for each image in the batch. If `None`, will not crop the image. """ @@ -930,8 +936,8 @@ def __init__( @staticmethod def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value_embed_dim: int): """ - Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. - If the aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued. + Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the + aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued. Args: mask (`torch.FloatTensor`): diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index a4593ec69404..c531d5a519f2 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -67,17 +67,18 @@ def load_ip_adapter( - A [torch state dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). subfolder (`str` or `List[str]`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - If a list is passed, it should have the same length as `weight_name`. + The subfolder location of a model file within a larger model repository on the Hub or locally. If a + list is passed, it should have the same length as `weight_name`. weight_name (`str` or `List[str]`): The name of the weight file to load. 
If a list is passed, it should have the same length as `weight_name`. image_encoder_folder (`str`, *optional*, defaults to `image_encoder`): The subfolder location of the image encoder within a larger model repository on the Hub or locally. - Pass `None` to not load the image encoder. If the image encoder is located in a folder inside `subfolder`, - you only need to pass the name of the folder that contains image encoder weights, e.g. `image_encoder_folder="image_encoder"`. - If the image encoder is located in a folder other than `subfolder`, you should pass the path to the folder that contains image encoder weights, - for example, `image_encoder_folder="different_subfolder/image_encoder"`. + Pass `None` to not load the image encoder. If the image encoder is located in a folder inside + `subfolder`, you only need to pass the name of the folder that contains image encoder weights, e.g. + `image_encoder_folder="image_encoder"`. If the image encoder is located in a folder other than + `subfolder`, you should pass the path to the folder that contains image encoder weights, for example, + `image_encoder_folder="different_subfolder/image_encoder"`. cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. diff --git a/src/diffusers/loaders/peft.py b/src/diffusers/loaders/peft.py index 01dbd3494a4c..5892c2865374 100644 --- a/src/diffusers/loaders/peft.py +++ b/src/diffusers/loaders/peft.py @@ -20,7 +20,8 @@ class PeftAdapterMixin: """ A class containing all functions for loading and using adapters weights that are supported in PEFT library. For - more details about adapters and injecting them in a transformer-based model, check out the PEFT [documentation](https://huggingface.co/docs/peft/index). + more details about adapters and injecting them in a transformer-based model, check out the PEFT + [documentation](https://huggingface.co/docs/peft/index). Install the latest version of PEFT, and use this mixin to: @@ -143,8 +144,8 @@ def disable_adapters(self) -> None: def enable_adapters(self) -> None: """ - Enable adapters that are attached to the model. The model uses `self.active_adapters()` to retrieve the - list of adapters to enable. + Enable adapters that are attached to the model. The model uses `self.active_adapters()` to retrieve the list of + adapters to enable. If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT [documentation](https://huggingface.co/docs/peft). diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py index 0d384b1647d5..752ef18c7a0b 100644 --- a/src/diffusers/loaders/single_file.py +++ b/src/diffusers/loaders/single_file.py @@ -198,19 +198,24 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): model_type (`str`, *optional*): The type of model to load. If not provided, the model type will be inferred from the checkpoint file. image_size (`int`, *optional*): - The size of the image output. It's used to configure the `sample_size` parameter of the UNet and VAE model. + The size of the image output. It's used to configure the `sample_size` parameter of the UNet and VAE + model. load_safety_checker (`bool`, *optional*, defaults to `False`): - Whether to load the safety checker model or not. By default, the safety checker is not loaded unless a `safety_checker` component is passed to the `kwargs`. + Whether to load the safety checker model or not. 
By default, the safety checker is not loaded unless a + `safety_checker` component is passed to the `kwargs`. num_in_channels (`int`, *optional*): - Specify the number of input channels for the UNet model. Read more about how to configure UNet model with this parameter + Specify the number of input channels for the UNet model. Read more about how to configure UNet model + with this parameter [here](https://huggingface.co/docs/diffusers/training/adapt_a_model#configure-unet2dconditionmodel-parameters). scaling_factor (`float`, *optional*): - The scaling factor to use for the VAE model. If not provided, it is inferred from the config file first. - If the scaling factor is not found in the config file, the default value 0.18215 is used. + The scaling factor to use for the VAE model. If not provided, it is inferred from the config file + first. If the scaling factor is not found in the config file, the default value 0.18215 is used. scheduler_type (`str`, *optional*): - The type of scheduler to load. If not provided, the scheduler type will be inferred from the checkpoint file. + The type of scheduler to load. If not provided, the scheduler type will be inferred from the checkpoint + file. prediction_type (`str`, *optional*): - The type of prediction to load. If not provided, the prediction type will be inferred from the checkpoint file. + The type of prediction to load. If not provided, the prediction type will be inferred from the + checkpoint file. kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline class). The overwritten components are passed directly to the pipelines `__init__` method. See example diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py index eb727990af18..c1c224975cb8 100644 --- a/src/diffusers/loaders/textual_inversion.py +++ b/src/diffusers/loaders/textual_inversion.py @@ -487,20 +487,35 @@ def unload_textual_inversion( # Example 3: unload from SDXL pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") - embedding_path = hf_hub_download(repo_id="linoyts/web_y2k", filename="web_y2k_emb.safetensors", repo_type="model") + embedding_path = hf_hub_download( + repo_id="linoyts/web_y2k", filename="web_y2k_emb.safetensors", repo_type="model" + ) # load embeddings to the text encoders state_dict = load_file(embedding_path) # load embeddings of text_encoder 1 (CLIP ViT-L/14) - pipeline.load_textual_inversion(state_dict["clip_l"], token=["", ""], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer) + pipeline.load_textual_inversion( + state_dict["clip_l"], + token=["", ""], + text_encoder=pipeline.text_encoder, + tokenizer=pipeline.tokenizer, + ) # load embeddings of text_encoder 2 (CLIP ViT-G/14) - pipeline.load_textual_inversion(state_dict["clip_g"], token=["", ""], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2) + pipeline.load_textual_inversion( + state_dict["clip_g"], + token=["", ""], + text_encoder=pipeline.text_encoder_2, + tokenizer=pipeline.tokenizer_2, + ) # Unload explicitly from both text encoders abd tokenizers - pipeline.unload_textual_inversion(tokens=["", ""], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer) - pipeline.unload_textual_inversion(tokens=["", ""], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2) - + pipeline.unload_textual_inversion( + tokens=["", ""], 
text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer + ) + pipeline.unload_textual_inversion( + tokens=["", ""], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2 + ) ``` """ diff --git a/src/diffusers/loaders/unet_loader_utils.py b/src/diffusers/loaders/unet_loader_utils.py index 918a0fca06c8..3ee4a96fad0a 100644 --- a/src/diffusers/loaders/unet_loader_utils.py +++ b/src/diffusers/loaders/unet_loader_utils.py @@ -74,37 +74,24 @@ def _maybe_expand_lora_scales_for_one_adapter( E.g. turns ```python - scales = { - 'down': 2, - 'mid': 3, - 'up': { - 'block_0': 4, - 'block_1': [5, 6, 7] - } - } - blocks_with_transformer = { - 'down': [1,2], - 'up': [0,1] - } - transformer_per_block = { - 'down': 2, - 'up': 3 - } + scales = {"down": 2, "mid": 3, "up": {"block_0": 4, "block_1": [5, 6, 7]}} + blocks_with_transformer = {"down": [1, 2], "up": [0, 1]} + transformer_per_block = {"down": 2, "up": 3} ``` into ```python { - 'down.block_1.0': 2, - 'down.block_1.1': 2, - 'down.block_2.0': 2, - 'down.block_2.1': 2, - 'mid': 3, - 'up.block_0.0': 4, - 'up.block_0.1': 4, - 'up.block_0.2': 4, - 'up.block_1.0': 5, - 'up.block_1.1': 6, - 'up.block_1.2': 7, + "down.block_1.0": 2, + "down.block_1.1": 2, + "down.block_2.0": 2, + "down.block_2.1": 2, + "mid": 3, + "up.block_0.0": 4, + "up.block_0.1": 4, + "up.block_0.2": 4, + "up.block_1.0": 5, + "up.block_1.1": 6, + "up.block_1.2": 7, } ``` """ diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 1fd29ce708c8..30086654a2f1 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -1298,9 +1298,9 @@ def __call__( class FusedAttnProcessor2_0: r""" - Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). - It uses fused projection layers. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). It uses + fused projection layers. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. + For cross-attention modules, key and value projection matrices are fused. diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index 9bbf2023eb99..b286453de424 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -453,8 +453,8 @@ def forward( # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. 
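The `fuse_qkv_projections` docstrings reflowed above and in several later files describe the same behaviour on different models; the sketch below simply calls the methods shown in this diff. The checkpoint name and the CUDA device are assumptions for illustration, not part of this patch.

```python
# Minimal sketch: enable fused QKV projections on a pipeline's UNet and VAE before inference.
# The checkpoint name and the CUDA device are illustrative assumptions.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

pipe.unet.fuse_qkv_projections()  # fuses query/key/value for self-attention, key/value for cross-attention
pipe.vae.fuse_qkv_projections()

image = pipe("an astronaut riding a horse", num_inference_steps=30).images[0]
```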
diff --git a/src/diffusers/models/controlnet_flax.py b/src/diffusers/models/controlnet_flax.py index 6f9b201aa1e3..0540850a9e61 100644 --- a/src/diffusers/models/controlnet_flax.py +++ b/src/diffusers/models/controlnet_flax.py @@ -329,15 +329,15 @@ def __call__( controlnet_cond (`jnp.ndarray`): (batch, channel, height, width) the conditional input tensor conditioning_scale (`float`, *optional*, defaults to `1.0`): the scale factor for controlnet outputs return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a - plain tuple. + Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of + a plain tuple. train (`bool`, *optional*, defaults to `False`): Use deterministic functions and disable dropout when not training. Returns: [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`: - [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is the sample tensor. + [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise + a `tuple`. When returning a tuple, the first element is the sample tensor. """ channel_order = self.controlnet_conditioning_channel_order if channel_order == "bgr": diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 85b1e4944ed2..91bbd58fa025 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -795,16 +795,13 @@ class IPAdapterPlusImageProjection(nn.Module): Args: ---- - embed_dims (int): The feature dimension. Defaults to 768. - output_dims (int): The number of output channels, that is the same - number of the channels in the - `unet.config.cross_attention_dim`. Defaults to 1024. - hidden_dims (int): The number of hidden channels. Defaults to 1280. - depth (int): The number of blocks. Defaults to 8. - dim_head (int): The number of head channels. Defaults to 64. - heads (int): Parallel attention heads. Defaults to 16. - num_queries (int): The number of queries. Defaults to 8. - ffn_ratio (float): The expansion ratio of feedforward network hidden + embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels, + that is the same + number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024. + hidden_dims (int): The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults + to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads. + Defaults to 16. num_queries (int): The number of queries. Defaults to 8. ffn_ratio (float): The expansion ratio + of feedforward network hidden layer channels. Defaults to 4. """ diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 88c7a01be6bf..adda53a11481 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -202,8 +202,8 @@ class ResnetBlock2D(nn.Module): eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization. non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use. time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config. - By default, apply timestep embedding conditioning with a simple shift mechanism. 
Choose "scale_shift" - for a stronger conditioning with scale and shift. + By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a + stronger conditioning with scale and shift. kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`]. output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output. diff --git a/src/diffusers/models/transformers/dual_transformer_2d.py b/src/diffusers/models/transformers/dual_transformer_2d.py index 96849bd28bb1..e2f1b8538ca0 100644 --- a/src/diffusers/models/transformers/dual_transformer_2d.py +++ b/src/diffusers/models/transformers/dual_transformer_2d.py @@ -120,7 +120,8 @@ def forward( `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. Returns: [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`: diff --git a/src/diffusers/models/transformers/transformer_temporal.py b/src/diffusers/models/transformers/transformer_temporal.py index a35aa4671e6c..c2d490f3d046 100644 --- a/src/diffusers/models/transformers/transformer_temporal.py +++ b/src/diffusers/models/transformers/transformer_temporal.py @@ -294,8 +294,8 @@ def forward( A tensor indicating whether the input contains only images. 1 indicates that the input contains only images, 0 indicates that the input contains video frames. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a plain - tuple. + Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a + plain tuple. Returns: [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`: diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py index 9a710919d067..34327e1049c5 100644 --- a/src/diffusers/models/unets/unet_2d_condition.py +++ b/src/diffusers/models/unets/unet_2d_condition.py @@ -865,8 +865,8 @@ def disable_freeu(self): def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. @@ -1093,8 +1093,8 @@ def forward( Returns: [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise - a `tuple` is returned where the first element is the sample tensor. + If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, + otherwise a `tuple` is returned where the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. 
# The overall upsampling factor is equal to 2 ** (# num of upsampling layers). diff --git a/src/diffusers/models/unets/unet_2d_condition_flax.py b/src/diffusers/models/unets/unet_2d_condition_flax.py index a5ec2875ca0e..edbbcbaeda73 100644 --- a/src/diffusers/models/unets/unet_2d_condition_flax.py +++ b/src/diffusers/models/unets/unet_2d_condition_flax.py @@ -76,7 +76,8 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin): up_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D")`): The tuple of upsample blocks to use. mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): - Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`. If `None`, the mid block layer is skipped. + Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`. If `None`, the mid block layer + is skipped. block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): The tuple of output channels for each block. layers_per_block (`int`, *optional*, defaults to 2): @@ -350,15 +351,15 @@ def __call__( mid_block_additional_residual: (`torch.Tensor`, *optional*): A tensor that if specified is added to the residual of the middle unet block. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a - plain tuple. + Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of + a plain tuple. train (`bool`, *optional*, defaults to `False`): Use deterministic functions and disable dropout when not training. Returns: [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`: - [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. - When returning a tuple, the first element is the sample tensor. + [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is the sample tensor. """ # 1. time if not isinstance(timesteps, jnp.ndarray): diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py index a827b4ddc5a7..6c353c425911 100644 --- a/src/diffusers/models/unets/unet_3d_condition.py +++ b/src/diffusers/models/unets/unet_3d_condition.py @@ -511,8 +511,8 @@ def disable_freeu(self): # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index 5c5c6a2cc5ec..0a5f71ed0029 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -99,8 +99,8 @@ def forward( class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): r""" - I2VGenXL UNet. 
It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep - and returns a sample-shaped output. + I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and + returns a sample-shaped output. This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented for all models (such as downloading or saving). @@ -477,8 +477,8 @@ def disable_freeu(self): # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. @@ -533,7 +533,8 @@ def forward( timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition". image_latents (`torch.FloatTensor`): Image encodings from the VAE. - image_embeddings (`torch.FloatTensor`): Projection embeddings of the conditioning image computed with a vision encoder. + image_embeddings (`torch.FloatTensor`): + Projection embeddings of the conditioning image computed with a vision encoder. encoder_hidden_states (`torch.FloatTensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. cross_attention_kwargs (`dict`, *optional*): diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index 88c0b967c099..595b7b03571c 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -709,8 +709,8 @@ def disable_freeu(self) -> None: # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. diff --git a/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/src/diffusers/models/unets/unet_spatio_temporal_condition.py index 5fe265e63fc5..0f89df8c6bff 100644 --- a/src/diffusers/models/unets/unet_spatio_temporal_condition.py +++ b/src/diffusers/models/unets/unet_spatio_temporal_condition.py @@ -31,8 +31,8 @@ class UNetSpatioTemporalConditionOutput(BaseOutput): class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): r""" - A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and returns a sample - shaped output. + A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and + returns a sample shaped output. This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented for all models (such as downloading or saving). 
@@ -57,7 +57,8 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL The dimension of the cross attention features. transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1): The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for - [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`], + [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], + [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`], [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`]. num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`): The number of attention heads. @@ -374,12 +375,12 @@ def forward( The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal embeddings and added to the time embeddings. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead of a plain - tuple. + Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead + of a plain tuple. Returns: [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`: - If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is returned, otherwise - a `tuple` is returned where the first element is the sample tensor. + If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is + returned, otherwise a `tuple` is returned where the first element is the sample tensor. """ # 1. time timesteps = timestep diff --git a/src/diffusers/models/unets/unet_stable_cascade.py b/src/diffusers/models/unets/unet_stable_cascade.py index 6227f7413a3c..ff76415ecf0e 100644 --- a/src/diffusers/models/unets/unet_stable_cascade.py +++ b/src/diffusers/models/unets/unet_stable_cascade.py @@ -186,7 +186,8 @@ def __init__( block_out_channels (Tuple[int], defaults to (2048, 2048)): Tuple of output channels for each block. num_attention_heads (Tuple[int], defaults to (32, 32)): - Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have attention. + Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have + attention. down_num_layers_per_block (Tuple[int], defaults to [8, 24]): Number of layers in each down block. up_num_layers_per_block (Tuple[int], defaults to [24, 8]): @@ -197,10 +198,9 @@ def __init__( Number of 1x1 Convolutional layers to repeat in each up block. block_types_per_layer (Tuple[Tuple[str]], optional, defaults to ( - ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), - ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock") - ): - Block types used in each layer of the up/down blocks. + ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), ("SDCascadeResBlock", + "SDCascadeTimestepBlock", "SDCascadeAttnBlock") + ): Block types used in each layer of the up/down blocks. clip_text_in_channels (`int`, *optional*, defaults to `None`): Number of input channels for CLIP based text conditioning. 
clip_text_pooled_in_channels (`int`, *optional*, defaults to 1280): diff --git a/src/diffusers/pipelines/amused/pipeline_amused.py b/src/diffusers/pipelines/amused/pipeline_amused.py index aa682b46fe70..994455ff29db 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused.py +++ b/src/diffusers/pipelines/amused/pipeline_amused.py @@ -30,9 +30,7 @@ >>> import torch >>> from diffusers import AmusedPipeline - >>> pipe = AmusedPipeline.from_pretrained( - ... "amused/amused-512", variant="fp16", torch_dtype=torch.float16 - ... ) + >>> pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16) >>> pipe = pipe.to("cuda") >>> prompt = "a photo of an astronaut riding a horse on mars" @@ -150,10 +148,12 @@ def __call__( A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): - The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/ - and the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted aesthetic score according to the laion aesthetic classifier. See + https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of + https://arxiv.org/abs/2307.01952. micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): - The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted height, width crop coordinates. See the micro-conditioning section of + https://arxiv.org/abs/2307.01952. temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`. diff --git a/src/diffusers/pipelines/amused/pipeline_amused_img2img.py b/src/diffusers/pipelines/amused/pipeline_amused_img2img.py index 8b49d1a64578..1218e7a44c4d 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused_img2img.py +++ b/src/diffusers/pipelines/amused/pipeline_amused_img2img.py @@ -167,10 +167,12 @@ def __call__( A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): - The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/ - and the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted aesthetic score according to the laion aesthetic classifier. See + https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of + https://arxiv.org/abs/2307.01952. micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): - The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted height, width crop coordinates. See the micro-conditioning section of + https://arxiv.org/abs/2307.01952. temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`. 
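The amused pipelines above document the micro-conditioning arguments in their reflowed form; the following sketch is built from the `AmusedPipeline` example already present in `pipeline_amused.py` and only shows those arguments being passed explicitly (the values are the documented defaults, not recommendations).

```python
# Sketch based on the AmusedPipeline example above; the aesthetic score and crop
# coordinates are the documented defaults, passed explicitly for illustration.
import torch
from diffusers import AmusedPipeline

pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

image = pipe(
    "a photo of an astronaut riding a horse on mars",
    micro_conditioning_aesthetic_score=6,  # target score for the laion aesthetic classifier
    micro_conditioning_crop_coord=(0, 0),  # targeted (height, width) crop coordinates
).images[0]
```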
diff --git a/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py b/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py index 423f5734b478..ab0a55cdd388 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py +++ b/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py @@ -191,10 +191,12 @@ def __call__( A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): - The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/ - and the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted aesthetic score according to the laion aesthetic classifier. See + https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of + https://arxiv.org/abs/2307.01952. micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): - The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted height, width crop coordinates. See the micro-conditioning section of + https://arxiv.org/abs/2307.01952. temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`. diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 12347227a15e..3765db938cd5 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -639,10 +639,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. 
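The `ip_adapter_image_embeds` contract reflowed in the animatediff docstring above (and repeated in the ControlNet pipelines further down) is easier to read with a concrete tensor; the sketch below mirrors the documented shape, with `emb_dim` chosen arbitrarily.

```python
# Hypothetical shape sketch for ip_adapter_image_embeds as documented above; emb_dim
# depends on the image encoder, and 1024 is only an illustrative value.
import torch

batch_size, num_images, emb_dim = 1, 1, 1024
image_embeds = torch.randn(batch_size, num_images, emb_dim)
ip_adapter_image_embeds = [image_embeds]  # one list entry per loaded IP-Adapter

# When do_classifier_free_guidance is True, the negative image embedding must be included as
# well (see the docstrings); if the argument is omitted, embeddings are computed from the
# ip_adapter_image input instead.
```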
diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 43d334439532..3677a9fde145 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -52,14 +52,21 @@ >>> from io import BytesIO >>> from PIL import Image - >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16) - >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter).to("cuda") - >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") + >>> adapter = MotionAdapter.from_pretrained( + ... "guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16 + ... ) + >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained( + ... "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter + ... ).to("cuda") + >>> pipe.scheduler = DDIMScheduler( + ... beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace" + ... ) + >>> def load_video(file_path: str): ... images = [] - ... - ... if file_path.startswith(('http://', 'https://')): + + ... if file_path.startswith(("http://", "https://")): ... # If the file_path is a URL ... response = requests.get(file_path) ... response.raise_for_status() @@ -68,15 +75,20 @@ ... else: ... # Assuming it's a local file path ... vid = imageio.get_reader(file_path) - ... + ... for frame in vid: ... pil_image = Image.fromarray(frame) ... images.append(pil_image) - ... + ... return images - >>> video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif") - >>> output = pipe(video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5) + + >>> video = load_video( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif" + ... ) + >>> output = pipe( + ... video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5 + ... ) >>> frames = output.frames[0] >>> export_to_gif(frames, "animation.gif") ``` @@ -135,8 +147,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -799,16 +811,15 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. 
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`AnimateDiffPipelineOutput`] instead - of a plain tuple. + Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). diff --git a/src/diffusers/pipelines/animatediff/pipeline_output.py b/src/diffusers/pipelines/animatediff/pipeline_output.py index 184a45848a37..97e7c87ad7f7 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_output.py +++ b/src/diffusers/pipelines/animatediff/pipeline_output.py @@ -15,7 +15,8 @@ class AnimateDiffPipelineOutput(BaseOutput): Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): - List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing + denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape `(batch_size, num_frames, channels, height, width)` """ diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py index c0b85e4db5f6..70bab832eea2 100644 --- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py @@ -701,8 +701,8 @@ def forward( Returns: [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise - a `tuple` is returned where the first element is the sample tensor. + If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, + otherwise a `tuple` is returned where the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 3c69fb06332c..b8443cb3b56c 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -107,8 +107,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. 
device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -922,9 +922,9 @@ def __call__( accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single ControlNet, - each will be paired with each prompt in the `prompt` list. This also applies to multiple ControlNets, - where a list of image lists can be passed to batch for each prompt and each ControlNet. + input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single + ControlNet, each will be paired with each prompt in the `prompt` list. This also applies to multiple + ControlNets, where a list of image lists can be passed to batch for each prompt and each ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -962,10 +962,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 403fe6a9e797..a5a0aaed0f2e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -978,10 +978,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. 
- If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index ddc0983f304d..66e88aaade1f 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -1167,11 +1167,12 @@ def __call__( width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. padding_mask_crop (`int`, *optional*, defaults to `None`): - The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If - `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and - contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on - the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large - and contain information irrelevant for inpainting, such as background. + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to + image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region + with the same aspect ration of the image and contains all masked area, and then expand that area based + on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before + resizing to the original image size for inpainting. This is useful when the masked area is small while + the image is large and contain information irrelevant for inpainting, such as background. strength (`float`, *optional*, defaults to 1.0): Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a starting point and more noise is added the higher the `strength`. The number of denoising steps depends @@ -1207,10 +1208,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. 
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index 3eb8f31b6a26..72904c62f97b 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -1194,11 +1194,12 @@ def __call__( width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The width in pixels of the generated image. padding_mask_crop (`int`, *optional*, defaults to `None`): - The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If - `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and - contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on - the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large - and contain information irrelevant for inpainting, such as background. + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to + image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region + with the same aspect ration of the image and contains all masked area, and then expand that area based + on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before + resizing to the original image size for inpainting. This is useful when the masked area is small while + the image is large and contain information irrelevant for inpainting, such as background. strength (`float`, *optional*, defaults to 0.9999): Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the @@ -1247,10 +1248,10 @@ def __call__( argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. 
pooled_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index d6591aa26f2a..e17941383bad 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -1039,10 +1039,10 @@ def __call__( argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index 6c00e2f3fc4b..5938d99b734e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -1178,10 +1178,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
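The `ip_adapter_image_embeds` description repeated across the ControlNet pipelines above is easier to follow with a concrete shape sketch. The snippet below is illustrative only: the tensors are random stand-ins for embeddings an image encoder would produce, `emb_dim` and the classifier-free-guidance stacking order are assumptions, and it supposes a single loaded IP-Adapter.

```py
# Illustrative sketch (not part of the diff) of the list the documented
# `ip_adapter_image_embeds` argument expects: one tensor per loaded IP-Adapter,
# shaped (batch_size, num_images, emb_dim).
import torch

num_ip_adapters = 1          # one embeddings tensor per loaded IP-Adapter
batch_size, num_images = 1, 1
emb_dim = 1024               # depends on the image encoder; 1024 is an assumption

positive = torch.randn(batch_size, num_images, emb_dim)
negative = torch.zeros_like(positive)  # negative/unconditional image embedding

# Per the docstring, the negative image embedding must be included when
# `do_classifier_free_guidance` is True; here it is stacked along the batch axis.
ip_adapter_image_embeds = [torch.cat([negative, positive], dim=0) for _ in range(num_ip_adapters)]

# Passed to a pipeline call roughly as:
#   pipe(prompt, image=control_image, ip_adapter_image_embeds=ip_adapter_image_embeds, ...)
```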
diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py index e4583699e79e..9a1bb5e78fb1 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -89,8 +89,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py index 156e52c249d9..48b3b96483d5 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -129,8 +129,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py index 550756cd80d8..1c55d088aa0a 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py @@ -1000,8 +1000,8 @@ def disable_freeu(self): def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. @@ -1112,8 +1112,8 @@ def forward( Returns: [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise - a `tuple` is returned where the first element is the sample tensor. + If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, + otherwise a `tuple` is returned where the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). 
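The `retrieve_timesteps` docstrings touched above all encode the same contract: callers provide either `num_inference_steps` or an explicit `timesteps` list, never both. The helper below is a simplified stand-in for that contract, not the library implementation; the real helper additionally checks whether the scheduler's `set_timesteps()` accepts custom timesteps.

```py
# Simplified sketch of the num_inference_steps / timesteps contract described
# in the retrieve_timesteps docstrings above. Illustration only.
def retrieve_timesteps_sketch(scheduler, num_inference_steps=None, device=None, timesteps=None):
    if timesteps is not None and num_inference_steps is not None:
        raise ValueError("Pass either `timesteps` or `num_inference_steps`, not both.")
    if timesteps is not None:
        # Only schedulers whose set_timesteps() supports custom timesteps accept this.
        scheduler.set_timesteps(timesteps=timesteps, device=device)
        num_inference_steps = len(scheduler.timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device)
    return scheduler.timesteps, num_inference_steps
```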
diff --git a/src/diffusers/pipelines/free_init_utils.py b/src/diffusers/pipelines/free_init_utils.py index a6eabc930172..4f7965a038c5 100644 --- a/src/diffusers/pipelines/free_init_utils.py +++ b/src/diffusers/pipelines/free_init_utils.py @@ -41,20 +41,20 @@ def enable_free_init( num_iters (`int`, *optional*, defaults to `3`): Number of FreeInit noise re-initialization iterations. use_fast_sampling (`bool`, *optional*, defaults to `False`): - Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables - the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`. + Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables the + "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`. method (`str`, *optional*, defaults to `butterworth`): - Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the - FreeInit low pass filter. + Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the FreeInit low + pass filter. order (`int`, *optional*, defaults to `4`): Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour whereas lower values lead to `gaussian` method behaviour. spatial_stop_frequency (`float`, *optional*, defaults to `0.25`): - Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in - the original implementation. + Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in the + original implementation. temporal_stop_frequency (`float`, *optional*, defaults to `0.25`): - Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in - the original implementation. + Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in the + original implementation. """ self._free_init_num_iters = num_iters self._free_init_use_fast_sampling = use_fast_sampling diff --git a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py index cb6f3e300904..a6b9499f5542 100644 --- a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +++ b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py @@ -43,10 +43,14 @@ >>> from diffusers import I2VGenXLPipeline >>> from diffusers.utils import export_to_gif, load_image - >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") + >>> pipeline = I2VGenXLPipeline.from_pretrained( + ... "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16" + ... ) >>> pipeline.enable_model_cpu_offload() - >>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png" + >>> image_url = ( + ... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png" + ... ) >>> image = load_image(image_url).convert("RGB") >>> prompt = "Papers were floating in the air on a table in the library" @@ -59,7 +63,7 @@ ... num_inference_steps=50, ... negative_prompt=negative_prompt, ... guidance_scale=9.0, - ... generator=generator + ... generator=generator, ... 
).frames[0] >>> video_path = export_to_gif(frames, "i2v.gif") ``` @@ -95,7 +99,8 @@ class I2VGenXLPipelineOutput(BaseOutput): Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): - List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing + denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape `(batch_size, num_frames, channels, height, width)` """ @@ -551,7 +556,8 @@ def __call__( width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. target_fps (`int`, *optional*): - Frames per second. The rate at which the generated images shall be exported to a video after generation. This is also used as a "micro-condition" while generation. + Frames per second. The rate at which the generated images shall be exported to a video after + generation. This is also used as a "micro-condition" while generation. num_frames (`int`, *optional*): The number of video frames to generate. num_inference_steps (`int`, *optional*): @@ -568,9 +574,9 @@ def __call__( num_videos_per_prompt (`int`, *optional*): The number of images to generate per prompt. decode_chunk_size (`int`, *optional*): - The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency - between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once - for maximal quality. Reduce `decode_chunk_size` to reduce memory usage. + The number of frames to decode at a time. The higher the chunk size, the higher the temporal + consistency between frames, but also the higher the memory consumption. By default, the decoder will + decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. diff --git a/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py b/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py index 4fe8c54eb7fc..5360632275b4 100755 --- a/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +++ b/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py @@ -35,10 +35,10 @@ def convert_state_dict(unet_state_dict): """ - Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model. Args: - unet_model (torch.nn.Module): The original U-Net model. - unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet model to match keys with. + Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model. + unet_model (torch.nn.Module): The original U-Net model. unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet + model to match keys with. Returns: OrderedDict: The converted state dictionary. 
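The FreeInit options documented in `free_init_utils.py` above map directly onto `enable_free_init`, which is available on pipelines that mix in `FreeInitMixin` (e.g. AnimateDiff). A hedged usage sketch follows; the checkpoint names are commonly used ones chosen here for illustration and are not taken from the diff.

```py
# Usage sketch for the FreeInit parameters documented above.
import torch
from diffusers import AnimateDiffPipeline, MotionAdapter

adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
pipe = AnimateDiffPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

pipe.enable_free_init(
    num_iters=3,                   # FreeInit noise re-initialization iterations
    use_fast_sampling=False,       # True enables "Coarse-to-Fine Sampling" (faster, possibly lower quality)
    method="butterworth",          # one of "butterworth", "ideal", "gaussian"
    order=4,                       # only meaningful for the butterworth filter
    spatial_stop_frequency=0.25,   # d_s in the original implementation
    temporal_stop_frequency=0.25,  # d_t in the original implementation
)
frames = pipe("a rocket launching into space", num_frames=16).frames[0]
pipe.disable_free_init()
```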
diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py index fcf7ddcb9966..076b3f77d477 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py @@ -24,7 +24,9 @@ >>> from diffusers import AutoPipelineForText2Image >>> import torch - >>> pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16) + >>> pipe = AutoPipelineForText2Image.from_pretrained( + ... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16 + ... ) >>> pipe.enable_model_cpu_offload() >>> prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background." diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py index 7f4164a04d1e..755e5089299c 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py @@ -29,11 +29,15 @@ >>> from diffusers.utils import load_image >>> import torch - >>> pipe = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16) + >>> pipe = AutoPipelineForImage2Image.from_pretrained( + ... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16 + ... ) >>> pipe.enable_model_cpu_offload() >>> prompt = "A painting of the inside of a subway train with tiny raccoons." - >>> image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png") + >>> image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png" + ... ) >>> generator = torch.Generator(device="cpu").manual_seed(0) >>> image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0] diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index e8482ffe9ce2..8957d7140ef1 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -73,8 +73,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -749,10 +749,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index 259a65c80782..286ba623331f 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -77,8 +77,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -681,10 +681,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py index a6357c4cd3a1..619be13a8f36 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py @@ -40,30 +40,21 @@ >>> from io import BytesIO >>> from diffusers import LEditsPPPipelineStableDiffusion + >>> from diffusers.utils import load_image >>> pipe = LEditsPPPipelineStableDiffusion.from_pretrained( ... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16 ... 
) >>> pipe = pipe.to("cuda") - >>> def download_image(url): - ... response = requests.get(url) - ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") - >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png" - >>> image = download_image(img_url) + >>> image = load_image(img_url).convert("RGB") - >>> _ = pipe.invert( - ... image = image, - ... num_inversion_steps=50, - ... skip=0.1 - ... ) + >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.1) >>> edited_image = pipe( - ... editing_prompt=["cherry blossom"], - ... edit_guidance_scale=10.0, - ... edit_threshold=0.75, - ).images[0] + ... editing_prompt=["cherry blossom"], edit_guidance_scale=10.0, edit_threshold=0.75 + ... ).images[0] ``` """ @@ -279,8 +270,8 @@ class LEditsPPPipelineStableDiffusion( unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically - be set to [`DPMSolverMultistepScheduler`]. + [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will + automatically be set to [`DPMSolverMultistepScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. @@ -531,8 +522,7 @@ def encode_prompt( `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). editing_prompt (`str` or `List[str]`, *optional*): - Editing prompt(s) to be encoded. If not defined, one has to pass - `editing_prompt_embeds` instead. + Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead. editing_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. @@ -734,8 +724,9 @@ def __call__( **kwargs, ): r""" - The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] - method has to be called beforehand. Edits will always be performed for the last inverted image(s). + The call function to the pipeline for editing. The + [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] method has to be called beforehand. Edits will + always be performed for the last inverted image(s). Args: negative_prompt (`str` or `List[str]`, *optional*): @@ -748,49 +739,51 @@ def __call__( The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a - plain tuple. + Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a plain + tuple. editing_prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. The image is reconstructed by setting - `editing_prompt = None`. 
Guidance direction of prompt should be specified via `reverse_editing_direction`. + `editing_prompt = None`. Guidance direction of prompt should be specified via + `reverse_editing_direction`. editing_prompt_embeds (`torch.Tensor>`, *optional*): - Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be - specified via `reverse_editing_direction`. + Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should + be specified via `reverse_editing_direction`. negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): Whether the corresponding prompt in `editing_prompt` should be increased or decreased. edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5): - Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`. - `edit_guidance_scale` is defined as `s_e` of equation 12 of - [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + Guidance scale for guiding the image generation. If provided as list values should correspond to + `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10): Number of diffusion steps (for each prompt) for which guidance will not be applied. edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`): Number of diffusion steps (for each prompt) after which guidance will no longer be applied. edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9): Masking threshold of guidance. Threshold should be proportional to the image region that is modified. - 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). user_mask (`torch.FloatTensor`, *optional*): - User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit - masks do not meet user preferences. + User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s + implicit masks do not meet user preferences. sem_guidance (`List[torch.Tensor]`, *optional*): List of pre-generated guidance vectors to be applied at generation. Length of the list has to correspond to `num_inference_steps`. use_cross_attn_mask (`bool`, defaults to `False`): Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask - is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of - [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). + is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++ + paper](https://arxiv.org/pdf/2311.16711.pdf). use_intersect_mask (`bool`, defaults to `True`): - Whether the masking term is calculated as intersection of cross-attention masks and masks derived - from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise - estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). 
+ Whether the masking term is calculated as intersection of cross-attention masks and masks derived from + the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate + are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). attn_store_steps (`List[int]`, *optional*): Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes. store_averaged_over_steps (`bool`, defaults to `True`): - Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. - If False, attention maps for each step are stores separately. Just for visualization purposes. + Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If + False, attention maps for each step are stores separately. Just for visualization purposes. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). @@ -815,10 +808,10 @@ def __call__( Returns: [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, - otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the - second element is a list of `bool`s denoting whether the corresponding generated image likely represents - "not-safe-for-work" (nsfw) content, according to the `safety_checker`. + [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + content, according to the `safety_checker`. """ if self.inversion_steps is None: @@ -1219,9 +1212,9 @@ def invert( crops_coords: Optional[Tuple[int, int, int, int]] = None, ): r""" - The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). - If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) - will be performed instead. + The function to the pipeline for image inversion as described by the [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the + inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead. Args: image (`PipelineImageInput`): @@ -1238,8 +1231,8 @@ def invert( Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values will lead to stronger changes to the input image. `skip` has to be between `0` and `1`. generator (`torch.Generator`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - inversion deterministic. + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion + deterministic. 
cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). @@ -1247,23 +1240,24 @@ def invert( Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. height (`int`, *optional*, defaults to `None`): - The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height. + The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default + height. width (`int`, *optional*`, defaults to `None`): - The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width. + The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width. resize_mode (`str`, *optional*, defaults to `default`): - The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit - within the specified width and height, and it may not maintaining the original aspect ratio. - If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, filling empty with data from image. - If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, cropping the excess. - Note that resize_mode `fill` and `crop` are only supported for PIL image input. + The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within + the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will + resize the image to fit within the specified width and height, maintaining the aspect ratio, and then + center the image within the dimensions, filling empty with data from image. If `crop`, will resize the + image to fit within the specified width and height, maintaining the aspect ratio, and then center the + image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only + supported for PIL image input. crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`): The crop coordinates for each image in the batch. If `None`, will not crop the image. Returns: - [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: - Output will contain the resized input image(s) and respective VAE reconstruction(s). + [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s) + and respective VAE reconstruction(s). """ # Reset attn processor, we do not want to store attn maps during inversion self.unet.set_attn_processor(AttnProcessor()) diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py index b1f773cb864b..cfab70926a4a 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py @@ -85,25 +85,23 @@ ... ) >>> pipe = pipe.to("cuda") + >>> def download_image(url): ... response = requests.get(url) ... 
return PIL.Image.open(BytesIO(response.content)).convert("RGB") + >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg" >>> image = download_image(img_url) - >>> _ = pipe.invert( - ... image = image, - ... num_inversion_steps=50, - ... skip=0.2 - ... ) + >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.2) >>> edited_image = pipe( - ... editing_prompt=["tennis ball","tomato"], - ... reverse_editing_direction=[True,False], - ... edit_guidance_scale=[5.0,10.0], - ... edit_threshold=[0.9,0.85], - ).images[0] + ... editing_prompt=["tennis ball", "tomato"], + ... reverse_editing_direction=[True, False], + ... edit_guidance_scale=[5.0, 10.0], + ... edit_threshold=[0.9, 0.85], + ... ).images[0] ``` """ @@ -292,9 +290,9 @@ class LEditsPPPipelineStableDiffusionXL( """ Pipeline for textual image editing using LEDits++ with Stable Diffusion XL. - This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the superclass - documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular - device, etc.). + This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the + superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a + particular device, etc.). In addition the pipeline inherits the following loading methods: - *LoRA*: [`LEditsPPPipelineStableDiffusionXL.load_lora_weights`] @@ -325,8 +323,8 @@ class LEditsPPPipelineStableDiffusionXL( unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically - be set to [`DPMSolverMultistepScheduler`]. + [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will + automatically be set to [`DPMSolverMultistepScheduler`]. force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of `stabilityai/stable-diffusion-xl-base-1-0`. @@ -453,9 +451,9 @@ def encode_prompt( Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass `editing_prompt_embeds` instead. editing_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input - argument. + Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from + `editing_prompt` input argument. editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt` @@ -835,8 +833,9 @@ def __call__( **kwargs, ): r""" - The call function to the pipeline for editing. 
The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] - method has to be called beforehand. Edits will always be performed for the last inverted image(s). + The call function to the pipeline for editing. The + [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] method has to be called beforehand. Edits + will always be performed for the last inverted image(s). Args: denoising_end (`float`, *optional*): @@ -894,11 +893,11 @@ def __call__( section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). editing_prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. The image is reconstructed by setting - `editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`. + `editing_prompt = None`. Guidance direction of prompt should be specified via + `reverse_editing_direction`. editing_prompt_embeddings (`torch.Tensor`, *optional*): - Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input - argument. + Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input argument. editing_pooled_prompt_embeddings (`torch.Tensor`, *optional*): Pre-generated pooled edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input @@ -906,35 +905,36 @@ def __call__( reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): Whether the corresponding prompt in `editing_prompt` should be increased or decreased. edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5): - Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`. - `edit_guidance_scale` is defined as `s_e` of equation 12 of - [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + Guidance scale for guiding the image generation. If provided as list values should correspond to + `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10): Number of diffusion steps (for each prompt) for which guidance is not applied. edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`): Number of diffusion steps (for each prompt) after which guidance is no longer applied. edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9): Masking threshold of guidance. Threshold should be proportional to the image region that is modified. - 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). sem_guidance (`List[torch.Tensor]`, *optional*): List of pre-generated guidance vectors to be applied at generation. Length of the list has to correspond to `num_inference_steps`. use_cross_attn_mask: Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask - is set to true. 
Cross-attention masks are defined as 'M^1' of equation 12 of - [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). + is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++ + paper](https://arxiv.org/pdf/2311.16711.pdf). use_intersect_mask: - Whether the masking term is calculated as intersection of cross-attention masks and masks derived - from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise - estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). + Whether the masking term is calculated as intersection of cross-attention masks and masks derived from + the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate + are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). user_mask: - User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit - masks do not meet user preferences. + User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s + implicit masks do not meet user preferences. attn_store_steps: Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes. store_averaged_over_steps: - Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. - If False, attention maps for each step are stores separately. Just for visualization purposes. + Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If + False, attention maps for each step are stores separately. Just for visualization purposes. clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. @@ -952,8 +952,8 @@ def __call__( Returns: [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, - otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. + [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images. """ if self.inversion_steps is None: raise ValueError( @@ -1446,9 +1446,9 @@ def invert( cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" - The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). - If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) - will be performed instead. + The function to the pipeline for image inversion as described by the [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the + inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead. Args: image (`PipelineImageInput`): @@ -1472,8 +1472,8 @@ def invert( Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values will lead to stronger changes to the input image. `skip` has to be between `0` and `1`. 
generator (`torch.Generator`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - inversion deterministic. + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion + deterministic. crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting @@ -1488,8 +1488,8 @@ def invert( [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). Returns: - [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: - Output will contain the resized input image(s) and respective VAE reconstruction(s). + [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s) + and respective VAE reconstruction(s). """ # Reset attn processor, we do not want to store attn maps during inversion diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_output.py b/src/diffusers/pipelines/ledits_pp/pipeline_output.py index b90005c97c4a..756be82b0069 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_output.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_output.py @@ -35,8 +35,8 @@ class LEditsPPInversionPipelineOutput(BaseOutput): List of the cropped and resized input images as PIL images of length `batch_size` or NumPy array of shape ` (batch_size, height, width, num_channels)`. vae_reconstruction_images (`List[PIL.Image.Image]` or `np.ndarray`) - List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape ` - (batch_size, height, width, num_channels)`. + List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape + ` (batch_size, height, width, num_channels)`. """ images: Union[List[PIL.Image.Image], np.ndarray] diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index 1bd9d087dc98..aceb95ae0451 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -59,6 +59,7 @@ ... PIAPipeline, ... ) >>> from diffusers.utils import export_to_gif, load_image + >>> adapter = MotionAdapter.from_pretrained("../checkpoints/pia-diffusers") >>> pipe = PIAPipeline.from_pretrained("SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter) >>> pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) @@ -135,9 +136,9 @@ class PIAPipelineOutput(BaseOutput): Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): - Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, - NumPy array of shape `(batch_size, num_frames, channels, height, width, - Torch tensor of shape `(batch_size, num_frames, channels, height, width)`. + Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, NumPy array of + shape `(batch_size, num_frames, channels, height, width, Torch tensor of shape `(batch_size, num_frames, + channels, height, width)`. """ frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] @@ -759,16 +760,15 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. 
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. motion_scale: (`int`, *optional*, defaults to 0): - Parameter that controls the amount and type of motion that is added to the image. Increasing the value increases the amount of motion, while specific - ranges of values control the type of motion that is added. Must be between 0 and 8. - Set between 0-2 to only increase the amount of motion. - Set between 3-5 to create looping motion. - Set between 6-8 to perform motion with image style transfer. + Parameter that controls the amount and type of motion that is added to the image. Increasing the value + increases the amount of motion, while specific ranges of values control the type of motion that is + added. Must be between 0 and 8. Set between 0-2 to only increase the amount of motion. Set between 3-5 + to create looping motion. Set between 6-8 to perform motion with image style transfer. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. @@ -795,8 +795,8 @@ def __call__( Returns: [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is - returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element is a list with the generated frames. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index a98d736aa557..0ed27293c178 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -538,7 +538,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P allowed by Git. custom_revision (`str`, *optional*): The specific model version to use. It can be a branch name, a tag name, or a commit id similar to - `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers version. + `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers + version. mirror (`str`, *optional*): Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not guarantee the timeliness or safety of the source, and you should refer to the mirror site for more @@ -1669,7 +1670,8 @@ def set_attention_slice(self, slice_size: Optional[int]): @classmethod def from_pipe(cls, pipeline, **kwargs): r""" - Create a new pipeline from a given pipeline. 
This method is useful to create a new pipeline from the existing pipeline components without reallocating additional memory. + Create a new pipeline from a given pipeline. This method is useful to create a new pipeline from the existing + pipeline components without reallocating additional memory. Arguments: pipeline (`DiffusionPipeline`): @@ -1851,8 +1853,8 @@ def disable_freeu(self): def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py index e7213a38bcad..608aa4eb1905 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -186,8 +186,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py index a5d9f06a59b6..65ac21f22007 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py @@ -334,8 +334,8 @@ def __call__( argument. negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input - argument. + weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` + input argument. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py index ecc92bbb8819..d27e727231c9 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py @@ -31,7 +31,10 @@ ```py >>> import torch >>> from diffusers import StableCascadeCombinedPipeline - >>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16) + + >>> pipe = StableCascadeCombinedPipeline.from_pretrained( + ... "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16 + ... 
) >>> pipe.enable_model_cpu_offload() >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" >>> images = pipe(prompt=prompt) diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py index 24ccc4b882e9..55fb4c28f6dd 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py @@ -80,7 +80,8 @@ class StableCascadePriorPipeline(DiffusionPipeline): prior ([`StableCascadeUNet`]): The Stable Cascade prior to approximate the image embedding from the text and/or image embedding. text_encoder ([`CLIPTextModelWithProjection`]): - Frozen text-encoder ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)). + Frozen text-encoder + ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)). feature_extractor ([`~transformers.CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `image_encoder`. image_encoder ([`CLIPVisionModelWithProjection`]): @@ -420,11 +421,11 @@ def __call__( argument. negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input - argument. + weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` + input argument. image_embeds (`torch.FloatTensor`, *optional*): - Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. - If not provided, image embeddings will be generated from `image` input argument if existing. + Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If + not provided, image embeddings will be generated from `image` input argument if existing. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): @@ -452,9 +453,9 @@ def __call__( Examples: Returns: - [`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if - `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the - generated image embeddings. + [`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if `return_dict` is + True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image + embeddings. """ # 0. Define commonly used variables diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index b9b9b60e759d..5305f70cab57 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -85,8 +85,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. 
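The `from_pipe` classmethod whose docstring is rewrapped above reuses the components of an existing pipeline instead of reloading them; a minimal sketch, assuming the target pipeline class exposes `from_pipe` and that the checkpoint shown is available:

```py
import torch
from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline

# Load one pipeline, then derive a second one that shares the same UNet, VAE, and text encoder in memory.
pipe_txt2img = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe_img2img = StableDiffusionImg2ImgPipeline.from_pipe(pipe_txt2img)  # no additional memory is allocated
```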
If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -801,10 +801,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 540eed6ebd56..1b31c099b177 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -125,8 +125,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -897,10 +897,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. 
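The `ip_adapter_image_embeds` contract repeated throughout these docstrings is easiest to see with dummy tensors; a shape-only sketch (the `emb_dim` value and the negative-before-positive stacking are assumptions for illustration, not verified pipeline internals):

```py
import torch

batch_size, num_images, emb_dim = 1, 1, 1024  # emb_dim depends on the image encoder; 1024 is illustrative
positive = torch.randn(batch_size, num_images, emb_dim)
negative = torch.zeros_like(positive)  # negative image embedding, required when classifier-free guidance is enabled

# One list entry per loaded IP-Adapter; a single adapter is assumed here.
ip_adapter_image_embeds = [torch.cat([negative, positive], dim=0)]  # assumed stacking order

# Passed instead of `ip_adapter_image`, e.g.:
# images = pipe(prompt, ip_adapter_image_embeds=ip_adapter_image_embeds).images
```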
output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index f0fc55873b21..6deac85b7350 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -189,8 +189,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -1022,11 +1022,12 @@ def __call__( width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. padding_mask_crop (`int`, *optional*, defaults to `None`): - The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If - `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and - contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on - the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large - and contain information irrelevant for inpainting, such as background. + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to + image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region + with the same aspect ration of the image and contains all masked area, and then expand that area based + on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before + resizing to the original image size for inpainting. This is useful when the masked area is small while + the image is large and contain information irrelevant for inpainting, such as background. strength (`float`, *optional*, defaults to 1.0): Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a starting point and more noise is added the higher the `strength`. The number of denoising steps depends @@ -1066,10 +1067,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. 
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index 170551312782..7eae699ba4d2 100644 --- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -90,8 +90,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -773,10 +773,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index cd5189b85e68..bd7cc443fecb 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -90,8 +90,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. 
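The `padding_mask_crop` behavior described above (crop around the mask, inpaint the crop, paste it back) is driven by a single call argument; a minimal sketch with an illustrative checkpoint and placeholder image paths:

```py
import torch
from diffusers import StableDiffusionInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")

image = load_image("scene.png")      # placeholder: a large image with a small masked region
mask = load_image("scene_mask.png")  # placeholder: white where inpainting should happen

# Inpaint only a 32-pixel-padded crop around the masked region, then resize/paste it back into the full image.
result = pipe(prompt="a red park bench", image=image, mask_image=mask, padding_mask_crop=32).images[0]
```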
device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -694,9 +694,9 @@ def get_views( circular_padding: bool = False, ) -> List[Tuple[int, int, int, int]]: """ - Generates a list of views based on the given parameters. - Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113). - If panorama's height/width < window_size, num_blocks of height/width should return 1. + Generates a list of views based on the given parameters. Here, we define the mappings F_i (see Eq. 7 in the + MultiDiffusion paper https://arxiv.org/abs/2302.08113). If panorama's height/width < window_size, num_blocks of + height/width should return 1. Args: panorama_height (int): The height of the panorama. @@ -706,8 +706,8 @@ def get_views( circular_padding (bool, optional): Whether to apply circular padding. Defaults to False. Returns: - List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains - four integers representing the start and end coordinates of the window in the panorama. + List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains four integers + representing the start and end coordinates of the window in the panorama. """ panorama_height /= 8 @@ -800,8 +800,8 @@ def __call__( The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. timesteps (`List[int]`, *optional*): - The timesteps at which to generate the images. If not specified, then the default - timestep spacing strategy of the scheduler is used. + The timesteps at which to generate the images. If not specified, then the default timestep spacing + strategy of the scheduler is used. guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. @@ -832,10 +832,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
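The `get_views` docstring above summarizes MultiDiffusion's window mapping; a simplified standalone sketch of the same idea (the `// 8` latent downscale and the stride arithmetic are paraphrased from the docstring, not copied from the implementation):

```py
from typing import List, Tuple


def sliding_views(
    panorama_height: int, panorama_width: int, window_size: int = 64, stride: int = 8
) -> List[Tuple[int, int, int, int]]:
    # Work on the latent grid, which is 8x smaller than the pixel resolution.
    h, w = panorama_height // 8, panorama_width // 8
    # If the panorama fits inside one window along an axis, a single block covers that axis.
    blocks_h = (h - window_size) // stride + 1 if h > window_size else 1
    blocks_w = (w - window_size) // stride + 1 if w > window_size else 1
    views = []
    for i in range(blocks_h * blocks_w):
        h_start = (i // blocks_w) * stride
        w_start = (i % blocks_w) * stride
        views.append((h_start, h_start + window_size, w_start, w_start + window_size))
    return views


# A 512x2048 panorama yields overlapping 64x64 latent windows stepped by 8 latent pixels.
print(len(sliding_views(512, 2048)))
```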
return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py index 2e7a1fa41b58..1c1464a4271e 100644 --- a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py @@ -619,8 +619,8 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the + `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index efad9cf6cc1b..7481a001f4b1 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -117,8 +117,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -919,10 +919,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 9f6227bc914a..8bcfcfbfe57a 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -134,8 +134,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -1067,10 +1067,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 378f53ab0844..3f37d6f56ff3 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -279,8 +279,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -1255,11 +1255,12 @@ def __call__( [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and checkpoints that are not specifically fine-tuned on low resolutions. padding_mask_crop (`int`, *optional*, defaults to `None`): - The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. 
If - `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and - contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on - the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large - and contain information irrelevant for inpainting, such as background. + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to + image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region + with the same aspect ration of the image and contains all masked area, and then expand that area based + on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before + resizing to the original image size for inpainting. This is useful when the masked area is small while + the image is large and contain information irrelevant for inpainting, such as background. strength (`float`, *optional*, defaults to 0.9999): Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the @@ -1319,10 +1320,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. eta (`float`, *optional*, defaults to 0.0): diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index 1342fe429145..ae4e12642242 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -37,10 +37,14 @@ >>> from diffusers import StableVideoDiffusionPipeline >>> from diffusers.utils import load_image, export_to_video - >>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") + >>> pipe = StableVideoDiffusionPipeline.from_pretrained( + ... "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16" + ... ) >>> pipe.to("cuda") - >>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg") + >>> image = load_image( + ... 
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg" + ... ) >>> image = image.resize((1024, 576)) >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] @@ -86,8 +90,8 @@ class StableVideoDiffusionPipelineOutput(BaseOutput): Args: frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]): - List of denoised PIL images of length `batch_size` or numpy array or torch tensor - of shape `(batch_size, num_frames, height, width, num_channels)`. + List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size, + num_frames, height, width, num_channels)`. """ frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor] @@ -104,7 +108,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): vae ([`AutoencoderKLTemporalDecoder`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): - Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). + Frozen CLIP image-encoder + ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). unet ([`UNetSpatioTemporalConditionModel`]): A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents. scheduler ([`EulerDiscreteScheduler`]): @@ -357,14 +362,15 @@ def __call__( Args: image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): - Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`. + Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, + 1]`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. num_frames (`int`, *optional*): - The number of video frames to generate. Defaults to `self.unet.config.num_frames` - (14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`). + The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for + `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`). num_inference_steps (`int`, *optional*, defaults to 25): The number of denoising steps. More denoising steps usually lead to a higher quality video at the expense of slower inference. This parameter is modulated by `strength`. @@ -373,16 +379,18 @@ def __call__( max_guidance_scale (`float`, *optional*, defaults to 3.0): The maximum guidance scale. Used for the classifier free guidance with last frame. fps (`int`, *optional*, defaults to 7): - Frames per second. The rate at which the generated images shall be exported to a video after generation. - Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. + Frames per second. The rate at which the generated images shall be exported to a video after + generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. motion_bucket_id (`int`, *optional*, defaults to 127): Used for conditioning the amount of motion for the generation. The higher the number the more motion will be in the video. 
noise_aug_strength (`float`, *optional*, defaults to 0.02): - The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion. + The amount of noise added to the init image, the higher it is the less the video will look like the + init image. Increase it for more motion. decode_chunk_size (`int`, *optional*): - The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the expense of more memory usage. By default, the decoder decodes all frames at once for maximal - quality. For lower memory usage, reduce `decode_chunk_size`. + The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the + expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality. + For lower memory usage, reduce `decode_chunk_size`. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of videos to generate per prompt. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): @@ -398,7 +406,8 @@ def __call__( A function that is called at the end of each denoising step during inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. - `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -411,8 +420,9 @@ def __call__( Returns: [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned, - otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) is returned. + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is + returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) + is returned. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 10f8dc66f79d..ffb1699f66d1 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -134,8 +134,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
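The Stable Video Diffusion knobs rewrapped above (`decode_chunk_size`, `motion_bucket_id`, `noise_aug_strength`, `fps`) trade memory against temporal consistency and control the amount of motion; a short sketch extending the docstring example (the specific values are illustrative, not recommendations):

```py
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video, load_image

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
).to("cuda")

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
).resize((1024, 576))

frames = pipe(
    image,
    num_frames=25,
    decode_chunk_size=2,     # smaller chunks lower peak memory at some cost in temporal consistency
    motion_bucket_id=180,    # higher values condition the model toward more motion
    noise_aug_strength=0.1,  # more init-image noise, i.e. more motion and less fidelity to the input
).frames[0]
export_to_video(frames, "generated.mp4", fps=7)
```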
timesteps (`List[int]`, *optional*): diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 59d4022923eb..b2bda39e6de6 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -150,8 +150,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -943,10 +943,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py index c155386cf173..2dae5b4ead69 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py @@ -17,7 +17,8 @@ class TextToVideoSDPipelineOutput(BaseOutput): Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): - List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing + denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape `(batch_size, num_frames, channels, height, width)` """ diff --git a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py index c074b9916301..6579e272a3bf 100644 --- a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py +++ b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py @@ -752,7 +752,8 @@ def forward( cross_attention_kwargs (*optional*): Keyword arguments to supply to the cross attention layers, if used. 
return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. hidden_states_is_embedding (`bool`, *optional*, defaults to `False`): Whether or not hidden_states is an embedding directly usable by the transformer. In this case we will ignore input handling (e.g. continuous, vectorized, etc.) and directly feed hidden_states into the diff --git a/src/diffusers/schedulers/scheduling_ddim_flax.py b/src/diffusers/schedulers/scheduling_ddim_flax.py index dc3d8455bdfe..23c71a61452a 100644 --- a/src/diffusers/schedulers/scheduling_ddim_flax.py +++ b/src/diffusers/schedulers/scheduling_ddim_flax.py @@ -85,7 +85,8 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin): trained_betas (`jnp.ndarray`, optional): option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. clip_sample (`bool`, default `True`): - option to clip predicted sample between for numerical stability. The clip range is determined by `clip_sample_range`. + option to clip predicted sample between for numerical stability. The clip range is determined by + `clip_sample_range`. clip_sample_range (`float`, default `1.0`): the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. set_alpha_to_one (`bool`, default `True`): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 5b452bddba70..7e0939e0d927 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -166,8 +166,8 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): the sampling process. If `True`, the sigmas and time steps are determined according to a sequence of `lambda(t)`. final_sigmas_type (`str`, defaults to `"zero"`): - The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma - is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. lambda_min_clipped (`float`, defaults to `-inf`): Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the cosine (`squaredcos_cap_v2`) noise schedule. diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 215d94a7863f..d7a073c2383e 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -108,11 +108,11 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `algorithm_type="dpmsolver++"`. algorithm_type (`str`, defaults to `dpmsolver++`): - Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++`. The - `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) - paper, and the `dpmsolver++` type implements the algorithms in the - [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. 
It is recommended to use `dpmsolver++` or - `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. + Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the + algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) paper, and the `dpmsolver++` type + implements the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is + recommended to use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in + Stable Diffusion. solver_type (`str`, defaults to `midpoint`): Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers. @@ -123,8 +123,8 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, the sigmas are determined according to a sequence of noise levels {σi}. final_sigmas_type (`str`, *optional*, defaults to `"zero"`): - The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma - is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. lambda_min_clipped (`float`, defaults to `-inf`): Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the cosine (`squaredcos_cap_v2`) noise schedule. diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index 9422d57cff89..26a41d7335c5 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -62,10 +62,9 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `algorithm_type="dpmsolver++"`. algorithm_type (`str`, defaults to `dpmsolver++`): - Algorithm type for the solver; can be `dpmsolver++` or `sde-dpmsolver++`. The - `dpmsolver++` type implements the algorithms in the - [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or - `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. + Algorithm type for the solver; can be `dpmsolver++` or `sde-dpmsolver++`. The `dpmsolver++` type implements + the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to + use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. solver_type (`str`, defaults to `midpoint`): Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers. @@ -77,8 +76,8 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference steps, but sometimes may result in blurring. 
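The solver recommendations rewrapped above (`dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling, plus the `final_sigmas_type` choice) amount to a scheduler swap; a minimal sketch with an illustrative base checkpoint:

```py
import torch
from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")

# Rebuild the scheduler from the pipeline's existing config, overriding the options discussed above.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config,
    algorithm_type="sde-dpmsolver++",
    solver_order=2,
    use_karras_sigmas=True,
    final_sigmas_type="zero",
)
image = pipe("an astronaut riding a horse", num_inference_steps=25).images[0]
```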
final_sigmas_type (`str`, defaults to `"zero"`): - The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma - is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. """ _compatibles = [] diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index bad6aeff8b62..f6a09ca1ee16 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -278,8 +278,7 @@ def step( generator (`torch.Generator`, *optional*): A random number generator. return_dict (`bool`): - Whether or not to return a [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or - tuple. + Whether or not to return a [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or tuple. Returns: [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or `tuple`: diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py index a80cc66a393d..b8d95c609bf1 100644 --- a/src/diffusers/schedulers/scheduling_sasolver.py +++ b/src/diffusers/schedulers/scheduling_sasolver.py @@ -92,19 +92,20 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): trained_betas (`np.ndarray`, *optional*): Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. predictor_order (`int`, defaults to 2): - The predictor order which can be `1` or `2` or `3` or '4'. It is recommended to use `predictor_order=2` for guided - sampling, and `predictor_order=3` for unconditional sampling. + The predictor order which can be `1` or `2` or `3` or '4'. It is recommended to use `predictor_order=2` for + guided sampling, and `predictor_order=3` for unconditional sampling. corrector_order (`int`, defaults to 2): - The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for guided - sampling, and `corrector_order=3` for unconditional sampling. + The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for + guided sampling, and `corrector_order=3` for unconditional sampling. prediction_type (`str`, defaults to `epsilon`, *optional*): Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen Video](https://imagen.research.google/video/paper.pdf) paper). tau_func (`Callable`, *optional*): - Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`. SA-Solver - will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample from vanilla - diffusion SDE if tau_func is set to `lambda t: 1`. For more details, please check https://arxiv.org/abs/2309.05019 + Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`. + SA-Solver will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample + from vanilla diffusion SDE if tau_func is set to `lambda t: 1`. 
For more details, please check + https://arxiv.org/abs/2309.05019 thresholding (`bool`, defaults to `False`): Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such as Stable Diffusion. @@ -114,8 +115,8 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `algorithm_type="dpmsolver++"`. algorithm_type (`str`, defaults to `data_prediction`): - Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use `data_prediction` - with `solver_order=2` for guided sampling like in Stable Diffusion. + Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use + `data_prediction` with `solver_order=2` for guided sampling like in Stable Diffusion. lower_order_final (`bool`, defaults to `True`): Whether to use lower-order solvers in the final steps. Default = True. use_karras_sigmas (`bool`, *optional*, defaults to `False`): @@ -402,14 +403,14 @@ def convert_model_output( **kwargs, ) -> torch.FloatTensor: """ - Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. Noise_prediction is - designed to discretize an integral of the noise prediction model, and data_prediction is designed to discretize an - integral of the data prediction model. + Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. + Noise_prediction is designed to discretize an integral of the noise prediction model, and data_prediction is + designed to discretize an integral of the data prediction model. - The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both noise - prediction and data prediction models. + The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both + noise prediction and data prediction models. diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index ee3cde5d2142..0216b7afc80a 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -132,8 +132,8 @@ def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor: class TCDScheduler(SchedulerMixin, ConfigMixin): """ - `TCDScheduler` incorporates the `Strategic Stochastic Sampling` introduced by the paper `Trajectory Consistency Distillation`, - extending the original Multistep Consistency Sampling to enable unrestricted trajectory traversal. + `TCDScheduler` incorporates the `Strategic Stochastic Sampling` introduced by the paper `Trajectory Consistency + Distillation`, extending the original Multistep Consistency Sampling to enable unrestricted trajectory traversal. This code is based on the official repo of TCD(https://github.com/jabir-zheng/TCD). @@ -543,8 +543,9 @@ def step( sample (`torch.FloatTensor`): A current instance of a sample created by the diffusion process. eta (`float`): - A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every step. - When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic sampling. + A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every + step. When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic + sampling. 
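The `tau_func` behavior spelled out above (vanilla diffusion ODE when it returns 0, vanilla diffusion SDE when it returns 1) is chosen at construction time; a minimal sketch, assuming `tau_func` is accepted as an `__init__` argument as the docstring implies:

```py
from diffusers import SASolverScheduler

# Deterministic sampling: SA-Solver follows the vanilla diffusion ODE when tau is always 0.
ode_scheduler = SASolverScheduler(tau_func=lambda t: 0)

# Default-like behavior: inject stochasticity only for mid-trajectory timesteps.
sde_scheduler = SASolverScheduler(tau_func=lambda t: 1 if 200 <= t <= 800 else 0)

# In practice you would typically rebuild the scheduler from an existing pipeline config, e.g.
# pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config, tau_func=lambda t: 0)
```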
generator (`torch.Generator`, *optional*): A random number generator. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 70e63a64c0a8..c95ea43e55e3 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -128,8 +128,8 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): steps_offset (`int`, defaults to 0): An offset added to the inference steps, as required by some model families. final_sigmas_type (`str`, defaults to `"zero"`): - The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma - is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py index f744c1dfb1aa..add95812122c 100644 --- a/src/diffusers/utils/dynamic_modules_utils.py +++ b/src/diffusers/utils/dynamic_modules_utils.py @@ -246,8 +246,8 @@ def get_cached_module_file( - You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private - or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). + You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private or + [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). @@ -434,8 +434,8 @@ def get_class_from_dynamic_module( - You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private - or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). + You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private or + [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py index e554b42ddd31..d70ee53aaa41 100644 --- a/src/diffusers/utils/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -112,7 +112,8 @@ def load_or_create_model_card( repo_id_or_path (`str`): The repo id (e.g., "runwayml/stable-diffusion-v1-5") or local path where to look for the model card. token (`str`, *optional*): - Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more details. + Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more + details. is_pipeline (`bool`): Boolean to indicate if we're adding tag to a [`DiffusionPipeline`]. from_training: (`bool`): Boolean flag to denote if the model card is being created from a training script. diff --git a/src/diffusers/utils/loading_utils.py b/src/diffusers/utils/loading_utils.py index 18f6ead64c4e..aa087e981731 100644 --- a/src/diffusers/utils/loading_utils.py +++ b/src/diffusers/utils/loading_utils.py @@ -16,8 +16,8 @@ def load_image( image (`str` or `PIL.Image.Image`): The image to convert to the PIL Image format. convert_method (Callable[[PIL.Image.Image], PIL.Image.Image], optional): - A conversion method to apply to the image after loading it. 
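`convert_method`, documented above, applies a custom conversion right after loading; a minimal sketch, reusing the SVD docstring image URL from earlier in this diff purely for illustration:

```py
from diffusers.utils import load_image

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"

rgb_image = load_image(url)                                               # default: converted to "RGB"
gray_image = load_image(url, convert_method=lambda im: im.convert("L"))   # custom conversion applied after loading
```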
- When set to `None` the image will be converted "RGB". + A conversion method to apply to the image after loading it. When set to `None` the image will be converted + "RGB". Returns: `PIL.Image.Image`: diff --git a/src/diffusers/utils/state_dict_utils.py b/src/diffusers/utils/state_dict_utils.py index 35fc4210a908..dc303a35a8e3 100644 --- a/src/diffusers/utils/state_dict_utils.py +++ b/src/diffusers/utils/state_dict_utils.py @@ -253,8 +253,8 @@ def convert_unet_state_dict_to_peft(state_dict): def convert_all_state_dict_to_peft(state_dict): r""" - Attempts to first `convert_state_dict_to_peft`, and if it doesn't detect `lora_linear_layer` - for a valid `DIFFUSERS` LoRA for example, attempts to exclusively convert the Unet `convert_unet_state_dict_to_peft` + Attempts to first `convert_state_dict_to_peft`, and if it doesn't detect `lora_linear_layer` for a valid + `DIFFUSERS` LoRA for example, attempts to exclusively convert the Unet `convert_unet_state_dict_to_peft` """ try: peft_dict = convert_state_dict_to_peft(state_dict) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 4ea541dac356..8fa16ddb9d9f 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -156,8 +156,8 @@ def get_tests_dir(append_path=None): # https://github.com/huggingface/accelerate/pull/1964 def str_to_bool(value) -> int: """ - Converts a string representation of truth to `True` (1) or `False` (0). - True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`; + Converts a string representation of truth to `True` (1) or `False` (0). True values are `y`, `yes`, `t`, `true`, + `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`; """ value = value.lower() if value in ("y", "yes", "t", "true", "on", "1"):
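`str_to_bool` above is the kind of helper used to parse boolean environment flags in the test suite; a hypothetical usage sketch (the environment variable name is an assumption):

```py
import os

from diffusers.utils.testing_utils import str_to_bool

# Accepted spellings per the docstring: y/yes/t/true/on/1 and n/no/f/false/off/0.
run_slow_tests = bool(str_to_bool(os.environ.get("RUN_SLOW", "0")))

if run_slow_tests:
    print("Running slow tests")
```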