
Commit

feat(diffusers/models): add models
townwish4git committed Jun 20, 2024
1 parent 6271bc5 commit 2a6bbde
Showing 35 changed files with 11,923 additions and 511 deletions.
16 changes: 2 additions & 14 deletions mindone/diffusers/README.md
@@ -97,20 +97,7 @@ Most base, utility and mixin class are available.
- [ ] StableDiffusionPipeline

### Model

#### AutoEncoders

- [x] AutoencoderKL

#### UNets

- [x] UNet1DModel
- [x] UNet2DConditionModel
- [x] StableCascadeUNet

#### Transformers

- [x] Transformer2DModel
- All Supported

### Scheduler
- [x] DDIMScheduler/DDPMScheduler/...(30)
@@ -134,6 +121,7 @@ Most base, utility and mixin class are available.
Unlike the original `posterior = DiagonalGaussianDistribution(latent)` output, which supports sampling via `posterior.sample()`,
we can only output the `latent` and then sample through `AutoencoderKL.diag_gauss_dist.sample(latent)`.
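
As an illustration of the difference, here is a minimal sketch (not part of this commit; the default `AutoencoderKL` config and a random input are assumptions):

```python
import numpy as np
import mindspore as ms
from mindone.diffusers import AutoencoderKL

# Sketch only: default config and random data are assumptions, not from this commit.
vae = AutoencoderKL()
x = ms.Tensor(np.random.randn(1, 3, 32, 32), ms.float32)

latent = vae.encode(x)[0]                # raw latent (mean/logvar), not a distribution object
z = vae.diag_gauss_dist.sample(latent)   # sampling goes through diag_gauss_dist instead of posterior.sample()
```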


## Credits

Hacked together by @geniuspatrick.
38 changes: 37 additions & 1 deletion mindone/diffusers/__init__.py
@@ -14,13 +14,31 @@
_import_structure = {
"configuration_utils": ["ConfigMixin"],
"models": [
"AsymmetricAutoencoderKL",
"AutoencoderKL",
"AutoencoderKLTemporalDecoder",
"AutoencoderTiny",
"ConsistencyDecoderVAE",
"ControlNetModel",
"I2VGenXLUNet",
"Kandinsky3UNet",
"ModelMixin",
"MotionAdapter",
"MultiAdapter",
"PriorTransformer",
"T2IAdapter",
"T5FilmDecoder",
"Transformer2DModel",
"SD3Transformer2DModel",
"StableCascadeUNet",
"UNet1DModel",
"UNet2DConditionModel",
"UNet2DModel",
"StableCascadeUNet",
"UNet3DConditionModel",
"UNetMotionModel",
"UNetSpatioTemporalConditionModel",
"UVit2DModel",
"VQModel",
],
"optimization": [
"get_constant_schedule",
@@ -80,13 +98,31 @@
if TYPE_CHECKING:
from .configuration_utils import ConfigMixin
from .models import (
AsymmetricAutoencoderKL,
AutoencoderKL,
AutoencoderKLTemporalDecoder,
AutoencoderTiny,
ConsistencyDecoderVAE,
ControlNetModel,
I2VGenXLUNet,
Kandinsky3UNet,
ModelMixin,
MotionAdapter,
MultiAdapter,
PriorTransformer,
SD3Transformer2DModel,
StableCascadeUNet,
T2IAdapter,
T5FilmDecoder,
Transformer2DModel,
UNet1DModel,
UNet2DConditionModel,
UNet2DModel,
UNet3DConditionModel,
UNetMotionModel,
UNetSpatioTemporalConditionModel,
UVit2DModel,
VQModel,
)
from .optimization import (
get_constant_schedule,
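For context, the expanded `_import_structure` above mirrors the `TYPE_CHECKING` block, so the newly added classes become importable from the package root. A tiny sketch (assuming `mindone` is installed):

```python
# Sketch: the lazy import machinery resolves these names on first access.
from mindone.diffusers import ControlNetModel, SD3Transformer2DModel, VQModel

print(ControlNetModel.__name__, SD3Transformer2DModel.__name__, VQModel.__name__)
```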
47 changes: 44 additions & 3 deletions mindone/diffusers/models/__init__.py
@@ -18,26 +18,67 @@

_import_structure = {
"adapter": ["MultiAdapter", "T2IAdapter"],
"autoencoders.autoencoder_asym_kl": ["AsymmetricAutoencoderKL"],
"autoencoders.autoencoder_kl": ["AutoencoderKL"],
"autoencoders.autoencoder_kl_temporal_decoder": ["AutoencoderKLTemporalDecoder"],
"autoencoders.autoencoder_tiny": ["AutoencoderTiny"],
"autoencoders.consistency_decoder_vae": ["ConsistencyDecoderVAE"],
"controlnet": ["ControlNetModel"],
"dual_transformer_2d": ["DualTransformer2DModel"],
"embeddings": ["ImageProjection"],
"modeling_utils": ["ModelMixin"],
"transformers.prior_transformer": ["PriorTransformer"],
"transformers.t5_film_transformer": ["T5FilmDecoder"],
"transformers.transformer_2d": ["Transformer2DModel"],
"transformers.transformer_temporal": ["TransformerTemporalModel"],
"transformers.transformer_sd3": ["SD3Transformer2DModel"],
"unets.unet_1d": ["UNet1DModel"],
"unets.unet_2d": ["UNet2DModel"],
"unets.unet_2d_condition": ["UNet2DConditionModel"],
"unets.unet_3d_condition": ["UNet3DConditionModel"],
"unets.unet_i2vgen_xl": ["I2VGenXLUNet"],
"unets.unet_kandinsky3": ["Kandinsky3UNet"],
"unets.unet_motion_model": ["MotionAdapter", "UNetMotionModel"],
"unets.unet_stable_cascade": ["StableCascadeUNet"],
"unets.unet_spatio_temporal_condition": ["UNetSpatioTemporalConditionModel"],
"unets.uvit_2d": ["UVit2DModel"],
"vq_model": ["VQModel"],
}

if TYPE_CHECKING:
from .adapter import MultiAdapter, T2IAdapter
from .autoencoders import AutoencoderKL
from .autoencoders import (
AsymmetricAutoencoderKL,
AutoencoderKL,
AutoencoderKLTemporalDecoder,
AutoencoderTiny,
ConsistencyDecoderVAE,
)
from .controlnet import ControlNetModel
from .embeddings import ImageProjection
from .modeling_utils import ModelMixin
from .transformers import SD3Transformer2DModel, Transformer2DModel
from .unets import StableCascadeUNet, UNet1DModel, UNet2DConditionModel, UNet2DModel
from .transformers import (
DualTransformer2DModel,
PriorTransformer,
SD3Transformer2DModel,
T5FilmDecoder,
Transformer2DModel,
TransformerTemporalModel,
)
from .unets import (
I2VGenXLUNet,
Kandinsky3UNet,
MotionAdapter,
StableCascadeUNet,
UNet1DModel,
UNet2DConditionModel,
UNet2DModel,
UNet3DConditionModel,
UNetMotionModel,
UNetSpatioTemporalConditionModel,
UVit2DModel,
)
from .vq_model import VQModel

else:
import sys
6 changes: 4 additions & 2 deletions mindone/diffusers/models/attention_processor.py
@@ -430,7 +430,7 @@ def get_attention_scores(self, query: ms.Tensor, key: ms.Tensor, attention_mask:
)
else:
attention_scores = ops.baddbmm(
attention_mask,
attention_mask.to(query.dtype),
query,
key.swapaxes(-1, -2),
beta=1,
@@ -475,7 +475,9 @@ def prepare_attention_mask(
# we want to instead pad by (0, remaining_length), where remaining_length is:
# remaining_length: int = target_length - current_length
# TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
attention_mask = ops.pad(attention_mask, (0, target_length), value=0.0)
attention_mask = ops.Pad(paddings=((0, 0),) * (attention_mask.ndim - 1) + ((0, target_length),))(
attention_mask
)

if out_dim == 3:
if attention_mask.shape[0] < batch_size * head_size:
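
For reference, a small sketch (not from the diff) of what the new `ops.Pad` call does to a 2-D mask, with made-up shapes:

```python
import numpy as np
import mindspore as ms
from mindspore import ops

attention_mask = ms.Tensor(np.zeros((2, 5)), ms.float32)  # (batch, current_length)
target_length = 8

# Zero-pad only the last axis on the right by `target_length`,
# mirroring the upstream behaviour flagged in the TODO above.
pad = ops.Pad(paddings=((0, 0),) * (attention_mask.ndim - 1) + ((0, target_length),))
print(pad(attention_mask).shape)  # (2, 13)
```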
4 changes: 4 additions & 0 deletions mindone/diffusers/models/autoencoders/__init__.py
@@ -1 +1,5 @@
from .autoencoder_asym_kl import AsymmetricAutoencoderKL
from .autoencoder_kl import AutoencoderKL
from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
from .autoencoder_tiny import AutoencoderTiny
from .consistency_decoder_vae import ConsistencyDecoderVAE
183 changes: 183 additions & 0 deletions mindone/diffusers/models/autoencoders/autoencoder_asym_kl.py
@@ -0,0 +1,183 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple, Union

import numpy as np

import mindspore as ms
from mindspore import nn

from ...configuration_utils import ConfigMixin, register_to_config
from ..modeling_outputs import AutoencoderKLOutput
from ..modeling_utils import ModelMixin
from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder, MaskConditionDecoder


class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
r"""
Designing a Better Asymmetric VQGAN for StableDiffusion https://arxiv.org/abs/2306.04632. A VAE model with KL loss
for encoding images into latents and decoding latent representations into images.
This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
for all models (such as downloading or saving).
Parameters:
in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
out_channels (int, *optional*, defaults to 3): Number of channels in the output.
down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
Tuple of downsample block types.
down_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
Tuple of down block output channels.
layers_per_down_block (`int`, *optional*, defaults to `1`):
Number of layers per down block.
up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
Tuple of upsample block types.
up_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
Tuple of up block output channels.
layers_per_up_block (`int`, *optional*, defaults to `1`):
Number of layers per up block.
act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
sample_size (`int`, *optional*, defaults to `32`): Sample input size.
norm_num_groups (`int`, *optional*, defaults to `32`):
Number of groups to use for the first normalization layer in ResNet blocks.
scaling_factor (`float`, *optional*, defaults to 0.18215):
The component-wise standard deviation of the trained latent space computed using the first batch of the
training set. This is used to scale the latent space to have unit variance when training the diffusion
model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
/ scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
"""

@register_to_config
def __init__(
self,
in_channels: int = 3,
out_channels: int = 3,
down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
down_block_out_channels: Tuple[int, ...] = (64,),
layers_per_down_block: int = 1,
up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
up_block_out_channels: Tuple[int, ...] = (64,),
layers_per_up_block: int = 1,
act_fn: str = "silu",
latent_channels: int = 4,
norm_num_groups: int = 32,
sample_size: int = 32,
scaling_factor: float = 0.18215,
) -> None:
super().__init__()

# pass init params to Encoder
self.encoder = Encoder(
in_channels=in_channels,
out_channels=latent_channels,
down_block_types=down_block_types,
block_out_channels=down_block_out_channels,
layers_per_block=layers_per_down_block,
act_fn=act_fn,
norm_num_groups=norm_num_groups,
double_z=True,
)

# pass init params to Decoder
self.decoder = MaskConditionDecoder(
in_channels=latent_channels,
out_channels=out_channels,
up_block_types=up_block_types,
block_out_channels=up_block_out_channels,
layers_per_block=layers_per_up_block,
act_fn=act_fn,
norm_num_groups=norm_num_groups,
)
self.diag_gauss_dist = DiagonalGaussianDistribution()

self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1, has_bias=True)
self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1, has_bias=True)

self.use_slicing = False
self.use_tiling = False

self.register_to_config(block_out_channels=up_block_out_channels)
self.register_to_config(force_upcast=False)

def encode(self, x: ms.Tensor, return_dict: bool = False) -> Union[AutoencoderKLOutput, Tuple[ms.Tensor]]:
h = self.encoder(x)
moments = self.quant_conv(h)

if not return_dict:
return (moments,)

return AutoencoderKLOutput(latent=moments)

def _decode(
self,
z: ms.Tensor,
image: Optional[ms.Tensor] = None,
mask: Optional[ms.Tensor] = None,
return_dict: bool = False,
) -> Union[DecoderOutput, Tuple[ms.Tensor]]:
z = self.post_quant_conv(z)
dec = self.decoder(z, image, mask)

if not return_dict:
return (dec,)

return DecoderOutput(sample=dec)

def decode(
self,
z: ms.Tensor,
generator: Optional[np.random.Generator] = None,
image: Optional[ms.Tensor] = None,
mask: Optional[ms.Tensor] = None,
return_dict: bool = False,
) -> Union[DecoderOutput, Tuple[ms.Tensor]]:
decoded = self._decode(z, image, mask)[0]

if not return_dict:
return (decoded,)

return DecoderOutput(sample=decoded)

def construct(
self,
sample: ms.Tensor,
mask: Optional[ms.Tensor] = None,
sample_posterior: bool = False,
return_dict: bool = False,
) -> Union[DecoderOutput, Tuple[ms.Tensor]]:
r"""
Args:
sample (`ms.Tensor`): Input sample.
mask (`ms.Tensor`, *optional*, defaults to `None`): Optional inpainting mask.
sample_posterior (`bool`, *optional*, defaults to `False`):
Whether to sample from the posterior.
return_dict (`bool`, *optional*, defaults to `False`):
Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
"""
x = sample
latent = self.encode(x)[0]
if sample_posterior:
z = self.diag_gauss_dist.sample(latent)
else:
z = self.diag_gauss_dist.mode(latent)

dec = self.decode(z, sample, mask)[0]

if not return_dict:
return (dec,)

return DecoderOutput(sample=dec)
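
A minimal usage sketch for the class above (not part of the commit; the default config, random input and an all-ones mask are assumptions):

```python
import numpy as np
import mindspore as ms
from mindone.diffusers.models import AsymmetricAutoencoderKL

vae = AsymmetricAutoencoderKL()                                 # default config, sketch only
sample = ms.Tensor(np.random.randn(1, 3, 32, 32), ms.float32)  # input image
mask = ms.Tensor(np.ones((1, 1, 32, 32)), ms.float32)          # inpainting mask

# construct() encodes, samples (or takes the mode of) the latent and decodes
# conditioned on image and mask; with return_dict=False (the default) it
# returns a one-element tuple.
(reconstruction,) = vae(sample, mask=mask, sample_posterior=True)
print(reconstruction.shape)  # (1, 3, 32, 32)
```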