
Commit

feat(diffusers/models): add models
townwish4git committed Jun 20, 2024
1 parent 6271bc5 commit 2a6bbde
Showing 35 changed files with 11,923 additions and 511 deletions.
16 changes: 2 additions & 14 deletions mindone/diffusers/README.md
@@ -97,20 +97,7 @@ Most base, utility and mixin class are available.
- [ ] StableDiffusionPipeline

### Model

#### AutoEncoders

- [x] AutoencoderKL

#### UNets

- [x] UNet1DModel
- [x] UNet2DConditionModel
- [x] StableCascadeUNet

#### Transformers

- [x] Transformer2DModel
- All Supported

### Scheduler
- [x] DDIMScheduler/DDPMScheduler/...(30)
@@ -134,6 +121,7 @@ Most base, utility and mixin class are available.
Unlike the original `posterior = DiagonalGaussianDistribution(latent)` output, which supports sampling via `posterior.sample()`,
we can only output the `latent` and then sample through `AutoencoderKL.diag_gauss_dist.sample(latent)`.
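
As an illustration of the difference, here is a minimal sketch (not part of this commit; the default `AutoencoderKL` config and a random input are assumptions):

```python
import numpy as np
import mindspore as ms
from mindone.diffusers import AutoencoderKL

# Sketch only: default config and random data are assumptions, not from this commit.
vae = AutoencoderKL()
x = ms.Tensor(np.random.randn(1, 3, 32, 32), ms.float32)

latent = vae.encode(x)[0]                # raw latent (mean/logvar), not a distribution object
z = vae.diag_gauss_dist.sample(latent)   # sampling goes through diag_gauss_dist instead of posterior.sample()
```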


## Credits

Hacked together by @geniuspatrick.
38 changes: 37 additions & 1 deletion mindone/diffusers/__init__.py
@@ -14,13 +14,31 @@
_import_structure = {
"configuration_utils": ["ConfigMixin"],
"models": [
"AsymmetricAutoencoderKL",
"AutoencoderKL",
"AutoencoderKLTemporalDecoder",
"AutoencoderTiny",
"ConsistencyDecoderVAE",
"ControlNetModel",
"I2VGenXLUNet",
"Kandinsky3UNet",
"ModelMixin",
"MotionAdapter",
"MultiAdapter",
"PriorTransformer",
"T2IAdapter",
"T5FilmDecoder",
"Transformer2DModel",
"SD3Transformer2DModel",
"StableCascadeUNet",
"UNet1DModel",
"UNet2DConditionModel",
"UNet2DModel",
"StableCascadeUNet",
"UNet3DConditionModel",
"UNetMotionModel",
"UNetSpatioTemporalConditionModel",
"UVit2DModel",
"VQModel",
],
"optimization": [
"get_constant_schedule",
@@ -80,13 +98,31 @@
if TYPE_CHECKING:
from .configuration_utils import ConfigMixin
from .models import (
AsymmetricAutoencoderKL,
AutoencoderKL,
AutoencoderKLTemporalDecoder,
AutoencoderTiny,
ConsistencyDecoderVAE,
ControlNetModel,
I2VGenXLUNet,
Kandinsky3UNet,
ModelMixin,
MotionAdapter,
MultiAdapter,
PriorTransformer,
SD3Transformer2DModel,
StableCascadeUNet,
T2IAdapter,
T5FilmDecoder,
Transformer2DModel,
UNet1DModel,
UNet2DConditionModel,
UNet2DModel,
UNet3DConditionModel,
UNetMotionModel,
UNetSpatioTemporalConditionModel,
UVit2DModel,
VQModel,
)
from .optimization import (
get_constant_schedule,
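For context, the expanded `_import_structure` above mirrors the `TYPE_CHECKING` block, so the newly added classes become importable from the package root. A tiny sketch (assuming `mindone` is installed):

```python
# Sketch: the lazy import machinery resolves these names on first access.
from mindone.diffusers import ControlNetModel, SD3Transformer2DModel, VQModel

print(ControlNetModel.__name__, SD3Transformer2DModel.__name__, VQModel.__name__)
```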
47 changes: 44 additions & 3 deletions mindone/diffusers/models/__init__.py
@@ -18,26 +18,67 @@

_import_structure = {
"adapter": ["MultiAdapter", "T2IAdapter"],
"autoencoders.autoencoder_asym_kl": ["AsymmetricAutoencoderKL"],
"autoencoders.autoencoder_kl": ["AutoencoderKL"],
"autoencoders.autoencoder_kl_temporal_decoder": ["AutoencoderKLTemporalDecoder"],
"autoencoders.autoencoder_tiny": ["AutoencoderTiny"],
"autoencoders.consistency_decoder_vae": ["ConsistencyDecoderVAE"],
"controlnet": ["ControlNetModel"],
"dual_transformer_2d": ["DualTransformer2DModel"],
"embeddings": ["ImageProjection"],
"modeling_utils": ["ModelMixin"],
"transformers.prior_transformer": ["PriorTransformer"],
"transformers.t5_film_transformer": ["T5FilmDecoder"],
"transformers.transformer_2d": ["Transformer2DModel"],
"transformers.transformer_temporal": ["TransformerTemporalModel"],
"transformers.transformer_sd3": ["SD3Transformer2DModel"],
"unets.unet_1d": ["UNet1DModel"],
"unets.unet_2d": ["UNet2DModel"],
"unets.unet_2d_condition": ["UNet2DConditionModel"],
"unets.unet_3d_condition": ["UNet3DConditionModel"],
"unets.unet_i2vgen_xl": ["I2VGenXLUNet"],
"unets.unet_kandinsky3": ["Kandinsky3UNet"],
"unets.unet_motion_model": ["MotionAdapter", "UNetMotionModel"],
"unets.unet_stable_cascade": ["StableCascadeUNet"],
"unets.unet_spatio_temporal_condition": ["UNetSpatioTemporalConditionModel"],
"unets.uvit_2d": ["UVit2DModel"],
"vq_model": ["VQModel"],
}

if TYPE_CHECKING:
from .adapter import MultiAdapter, T2IAdapter
from .autoencoders import AutoencoderKL
from .autoencoders import (
AsymmetricAutoencoderKL,
AutoencoderKL,
AutoencoderKLTemporalDecoder,
AutoencoderTiny,
ConsistencyDecoderVAE,
)
from .controlnet import ControlNetModel
from .embeddings import ImageProjection
from .modeling_utils import ModelMixin
from .transformers import SD3Transformer2DModel, Transformer2DModel
from .unets import StableCascadeUNet, UNet1DModel, UNet2DConditionModel, UNet2DModel
from .transformers import (
DualTransformer2DModel,
PriorTransformer,
SD3Transformer2DModel,
T5FilmDecoder,
Transformer2DModel,
TransformerTemporalModel,
)
from .unets import (
I2VGenXLUNet,
Kandinsky3UNet,
MotionAdapter,
StableCascadeUNet,
UNet1DModel,
UNet2DConditionModel,
UNet2DModel,
UNet3DConditionModel,
UNetMotionModel,
UNetSpatioTemporalConditionModel,
UVit2DModel,
)
from .vq_model import VQModel

else:
import sys
6 changes: 4 additions & 2 deletions mindone/diffusers/models/attention_processor.py
@@ -430,7 +430,7 @@ def get_attention_scores(self, query: ms.Tensor, key: ms.Tensor, attention_mask:
)
else:
attention_scores = ops.baddbmm(
attention_mask,
attention_mask.to(query.dtype),
query,
key.swapaxes(-1, -2),
beta=1,
@@ -475,7 +475,9 @@ def prepare_attention_mask(
# we want to instead pad by (0, remaining_length), where remaining_length is:
# remaining_length: int = target_length - current_length
# TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
attention_mask = ops.pad(attention_mask, (0, target_length), value=0.0)
attention_mask = ops.Pad(paddings=((0, 0),) * (attention_mask.ndim - 1) + ((0, target_length),))(
attention_mask
)

if out_dim == 3:
if attention_mask.shape[0] < batch_size * head_size:
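
For reference, a small sketch (not from the diff) of what the new `ops.Pad` call does to a 2-D mask, with made-up shapes:

```python
import numpy as np
import mindspore as ms
from mindspore import ops

attention_mask = ms.Tensor(np.zeros((2, 5)), ms.float32)  # (batch, current_length)
target_length = 8

# Zero-pad only the last axis on the right by `target_length`,
# mirroring the upstream behaviour flagged in the TODO above.
pad = ops.Pad(paddings=((0, 0),) * (attention_mask.ndim - 1) + ((0, target_length),))
print(pad(attention_mask).shape)  # (2, 13)
```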
4 changes: 4 additions & 0 deletions mindone/diffusers/models/autoencoders/__init__.py
@@ -1 +1,5 @@
from .autoencoder_asym_kl import AsymmetricAutoencoderKL
from .autoencoder_kl import AutoencoderKL
from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
from .autoencoder_tiny import AutoencoderTiny
from .consistency_decoder_vae import ConsistencyDecoderVAE
183 changes: 183 additions & 0 deletions mindone/diffusers/models/autoencoders/autoencoder_asym_kl.py
@@ -0,0 +1,183 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple, Union

import numpy as np

import mindspore as ms
from mindspore import nn

from ...configuration_utils import ConfigMixin, register_to_config
from ..modeling_outputs import AutoencoderKLOutput
from ..modeling_utils import ModelMixin
from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder, MaskConditionDecoder


class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
r"""
Designing a Better Asymmetric VQGAN for StableDiffusion https://arxiv.org/abs/2306.04632. A VAE model with KL loss
for encoding images into latents and decoding latent representations into images.
This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
for all models (such as downloading or saving).
Parameters:
in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
out_channels (int, *optional*, defaults to 3): Number of channels in the output.
down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
Tuple of downsample block types.
down_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
Tuple of down block output channels.
layers_per_down_block (`int`, *optional*, defaults to `1`):
Number of layers per down block.
up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
Tuple of upsample block types.
up_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
Tuple of up block output channels.
layers_per_up_block (`int`, *optional*, defaults to `1`):
Number of layers per up block.
act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
sample_size (`int`, *optional*, defaults to `32`): Sample input size.
norm_num_groups (`int`, *optional*, defaults to `32`):
Number of groups to use for the first normalization layer in ResNet blocks.
scaling_factor (`float`, *optional*, defaults to 0.18215):
The component-wise standard deviation of the trained latent space computed using the first batch of the
training set. This is used to scale the latent space to have unit variance when training the diffusion
model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
/ scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
"""

@register_to_config
def __init__(
self,
in_channels: int = 3,
out_channels: int = 3,
down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
down_block_out_channels: Tuple[int, ...] = (64,),
layers_per_down_block: int = 1,
up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
up_block_out_channels: Tuple[int, ...] = (64,),
layers_per_up_block: int = 1,
act_fn: str = "silu",
latent_channels: int = 4,
norm_num_groups: int = 32,
sample_size: int = 32,
scaling_factor: float = 0.18215,
) -> None:
super().__init__()

# pass init params to Encoder
self.encoder = Encoder(
in_channels=in_channels,
out_channels=latent_channels,
down_block_types=down_block_types,
block_out_channels=down_block_out_channels,
layers_per_block=layers_per_down_block,
act_fn=act_fn,
norm_num_groups=norm_num_groups,
double_z=True,
)

# pass init params to Decoder
self.decoder = MaskConditionDecoder(
in_channels=latent_channels,
out_channels=out_channels,
up_block_types=up_block_types,
block_out_channels=up_block_out_channels,
layers_per_block=layers_per_up_block,
act_fn=act_fn,
norm_num_groups=norm_num_groups,
)
self.diag_gauss_dist = DiagonalGaussianDistribution()

self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1, has_bias=True)
self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1, has_bias=True)

self.use_slicing = False
self.use_tiling = False

self.register_to_config(block_out_channels=up_block_out_channels)
self.register_to_config(force_upcast=False)

def encode(self, x: ms.Tensor, return_dict: bool = False) -> Union[AutoencoderKLOutput, Tuple[ms.Tensor]]:
h = self.encoder(x)
moments = self.quant_conv(h)

if not return_dict:
return (moments,)

return AutoencoderKLOutput(latent=moments)

def _decode(
self,
z: ms.Tensor,
image: Optional[ms.Tensor] = None,
mask: Optional[ms.Tensor] = None,
return_dict: bool = False,
) -> Union[DecoderOutput, Tuple[ms.Tensor]]:
z = self.post_quant_conv(z)
dec = self.decoder(z, image, mask)

if not return_dict:
return (dec,)

return DecoderOutput(sample=dec)

def decode(
self,
z: ms.Tensor,
generator: Optional[np.random.Generator] = None,
image: Optional[ms.Tensor] = None,
mask: Optional[ms.Tensor] = None,
return_dict: bool = False,
) -> Union[DecoderOutput, Tuple[ms.Tensor]]:
decoded = self._decode(z, image, mask)[0]

if not return_dict:
return (decoded,)

return DecoderOutput(sample=decoded)

def construct(
self,
sample: ms.Tensor,
mask: Optional[ms.Tensor] = None,
sample_posterior: bool = False,
return_dict: bool = False,
) -> Union[DecoderOutput, Tuple[ms.Tensor]]:
r"""
Args:
sample (`ms.Tensor`): Input sample.
mask (`ms.Tensor`, *optional*, defaults to `None`): Optional inpainting mask.
sample_posterior (`bool`, *optional*, defaults to `False`):
Whether to sample from the posterior.
return_dict (`bool`, *optional*, defaults to `False`):
Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
"""
x = sample
latent = self.encode(x)[0]
if sample_posterior:
z = self.diag_gauss_dist.sample(latent)
else:
z = self.diag_gauss_dist.mode(latent)

dec = self.decode(z, sample, mask)[0]

if not return_dict:
return (dec,)

return DecoderOutput(sample=dec)
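
A minimal usage sketch for the class above (not part of the commit; the default config, random input and an all-ones mask are assumptions):

```python
import numpy as np
import mindspore as ms
from mindone.diffusers.models import AsymmetricAutoencoderKL

vae = AsymmetricAutoencoderKL()                                 # default config, sketch only
sample = ms.Tensor(np.random.randn(1, 3, 32, 32), ms.float32)  # input image
mask = ms.Tensor(np.ones((1, 1, 32, 32)), ms.float32)          # inpainting mask

# construct() encodes, samples (or takes the mode of) the latent and decodes
# conditioned on image and mask; with return_dict=False (the default) it
# returns a one-element tuple.
(reconstruction,) = vae(sample, mask=mask, sample_posterior=True)
print(reconstruction.shape)  # (1, 3, 32, 32)
```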