PaddlePaddle · chang-wenbin · Jul 24, 2024 · Jul 24, 2024 · Jul 24, 2024 · Jul 24, 2024
diff --git a/ppdiffusers/examples/inference/class_conditional_image_generation-dit.py b/ppdiffusers/examples/inference/class_conditional_image_generation-dit.py
@@ -17,7 +17,7 @@
 
 from ppdiffusers import DDIMScheduler, DiTPipeline
 
-dtype = paddle.float32
+dtype = paddle.float16
 pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", paddle_dtype=dtype)
 pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
 set_seed(42)

diff --git a/ppdiffusers/ppdiffusers/models/activations.py b/ppdiffusers/ppdiffusers/models/activations.py
@@ -17,6 +17,7 @@
 import paddle.nn.functional as F
 from paddle import nn
 
+from paddle.framework import LayerHelper, in_dynamic_mode
 from ..utils import USE_PEFT_BACKEND
 from .lora import LoRACompatibleLinear
 
@@ -64,9 +65,63 @@ def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
     def gelu(self, gate: paddle.Tensor) -> paddle.Tensor:
         return F.gelu(gate, approximate=self.approximate != "none")
 
+    def compute_activation(
+        self,
+        ffn1_out,
+        bias=None,
+        dequant_scales=None,
+        shift=None,
+        smooth=None,
+        act_method="swiglu",
+        compute_dtype="default",
+        quant_scale=-1,
+        quant_round_type=0,
+        quant_max_bound=0,
+        quant_min_bound=0,
+    ):
+        """Run the `fused_bias_act` operator on `ffn1_out` and return its output.
+
+        Dynamic mode calls `paddle._C_ops.fused_bias_act` with positional
+        arguments. Otherwise the op is appended to the static program via
+        `LayerHelper`, with an output variable of `ffn1_out.dtype`; optional
+        tensors that are `None` are left out of the static op inputs.
+        """
+        if in_dynamic_mode():
+            return paddle._C_ops.fused_bias_act(
+                ffn1_out, bias, dequant_scales, shift, smooth, act_method, compute_dtype,
+                quant_scale, quant_round_type, quant_max_bound, quant_min_bound,
+            )
+
+        helper = LayerHelper("fused_bias_act")
+        out = helper.create_variable_for_type_inference(dtype=ffn1_out.dtype)
+        # Forward every optional tensor, not just `bias`, so the static op
+        # receives the same inputs as the dynamic call above.
+        optional_inputs = {
+            "bias": bias,
+            "dequant_scales": dequant_scales,
+            "shift": shift,
+            "smooth": smooth,
+        }
+        inputs = {"x": ffn1_out}
+        inputs.update({name: t for name, t in optional_inputs.items() if t is not None})
+        attrs = {
+            "act_method": act_method,
+            "compute_dtype": compute_dtype,
+            "quant_scale": quant_scale,
+            "quant_round_type": quant_round_type,
+            "quant_max_bound": quant_max_bound,
+            "quant_min_bound": quant_min_bound,
+        }
+        helper.append_op(type="fused_bias_act", inputs=inputs, outputs={"out": out}, attrs=attrs)
+        return out
+
     def forward(self, hidden_states):
-        hidden_states = self.proj(hidden_states)
-        hidden_states = self.gelu(hidden_states)
+        # Fused path: multiply by the projection weight directly, then let
+        # `compute_activation` (fused_bias_act) add `self.proj.bias` and apply "gelu".
+        # NOTE(review): `self.gelu` / `self.approximate` are bypassed here -- confirm intended.
+        hidden_states = paddle.matmul(hidden_states, self.proj.weight)
+        hidden_states = self.compute_activation(hidden_states, self.proj.bias, act_method="gelu")
+
         return hidden_states
 
 

diff --git a/ppdiffusers/ppdiffusers/models/attention.py b/ppdiffusers/ppdiffusers/models/attention.py
@@ -23,7 +23,8 @@
 from .embeddings import SinusoidalPositionalEmbedding
 from .lora import LoRACompatibleLinear
 from .normalization import AdaLayerNorm, AdaLayerNormZero
-
+from paddle.framework import LayerHelper, in_dynamic_mode
+from paddle.incubate.tt import adaptive_layer_norm, fused_adaLN_scale_residual
 
 def _chunked_feed_forward(
     ff: nn.Layer, hidden_states: paddle.Tensor, chunk_dim: int, chunk_size: int, lora_scale: Optional[float] = None
@@ -227,6 +228,7 @@ def __init__(
                 {} if norm_elementwise_affine else dict(weight_attr=False, bias_attr=False)
             )
             self.norm3 = nn.LayerNorm(dim, epsilon=norm_eps, **norm_elementwise_affine_kwargs)
+            self.epsilon = norm_eps
 
         self.ff = FeedForward(
             dim,
@@ -300,12 +302,12 @@ def forward(
             attention_mask=attention_mask,
             **cross_attention_kwargs,
         )
-        if self.use_ada_layer_norm_zero:
-            attn_output = gate_msa.unsqueeze(1) * attn_output
-        elif self.use_ada_layer_norm_single:
-            attn_output = gate_msa * attn_output
+        # if self.use_ada_layer_norm_zero:
+        #     attn_output = gate_msa.unsqueeze(1) * attn_output 
+        # elif self.use_ada_layer_norm_single:
+        #     attn_output = gate_msa * attn_output
 
-        hidden_states = attn_output + hidden_states
+        # hidden_states = attn_output + hidden_states   
         if hidden_states.ndim == 4:
             hidden_states = hidden_states.squeeze(1)
 
@@ -338,11 +340,15 @@ def forward(
             hidden_states = attn_output + hidden_states
 
         # 4. Feed-forward
-        if not self.use_ada_layer_norm_single:
-            norm_hidden_states = self.norm3(hidden_states)
+        # if not self.use_ada_layer_norm_single:
+        #     norm_hidden_states = self.norm3(hidden_states)
 
-        if self.use_ada_layer_norm_zero:
-            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        # if self.use_ada_layer_norm_zero:
+        #     norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        # add triton adaptive_layer_norm pass;
+        # norm_hidden_states = paddle.incubate.tt.adaptive_layer_norm(hidden_states, scale_mlp, shift_mlp, epsilon=self.epsilon)
+
+        hidden_states, norm_hidden_states = fused_adaLN_scale_residual(hidden_states, attn_output, gate_msa, scale_mlp, shift_mlp, epsilon=self.epsilon)
 
         if self.use_ada_layer_norm_single:
             norm_hidden_states = self.norm2(hidden_states)

diff --git a/ppdiffusers/ppdiffusers/models/attention_processor.py b/ppdiffusers/ppdiffusers/models/attention_processor.py
@@ -26,6 +26,7 @@
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+pass_fusion=True
 
 @maybe_allow_in_graph
 class Attention(nn.Layer):
@@ -172,20 +173,26 @@ def __init__(
         else:
             linear_cls = LoRACompatibleLinear
 
-        self.to_q = linear_cls(query_dim, self.inner_dim, bias_attr=bias)
-
-        if not self.only_cross_attention:
-            # only relevant for the `AddedKVProcessor` classes
-            self.to_k = linear_cls(self.cross_attention_dim, self.inner_dim, bias_attr=bias)
-            self.to_v = linear_cls(self.cross_attention_dim, self.inner_dim, bias_attr=bias)
+        # @ AIbin
+        # add kqv_fusion pass start
+        if pass_fusion is True :
+            # print("Passing fusion!")
+            self.to_qkv = linear_cls(query_dim, 3 * self.inner_dim, bias_attr=bias)
         else:
-            self.to_k = None
-            self.to_v = None
-
+            self.to_q = linear_cls(query_dim, self.inner_dim, bias_attr=bias)
+            if not self.only_cross_attention:
+                # only relevant for the `AddedKVProcessor` classes
+                self.to_k = linear_cls(self.cross_attention_dim, self.inner_dim, bias_attr=bias)
+                self.to_v = linear_cls(self.cross_attention_dim, self.inner_dim, bias_attr=bias)
+            else:
+                self.to_k = None
+                self.to_v = None
+
         if self.added_kv_proj_dim is not None:
             self.add_k_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
             self.add_v_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
-
+        # @ AIbin
+        # add kqv_fusion pass end
         self.to_out = nn.LayerList([])
         self.to_out.append(linear_cls(self.inner_dim, query_dim, bias_attr=out_bias))
         self.to_out.append(nn.Dropout(dropout))
@@ -1038,16 +1045,18 @@ def __call__(
 
         if attn.group_norm is not None:
             hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
-
-        query = attn.to_q(hidden_states, *args)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states, *args)
-        value = attn.to_v(encoder_hidden_states, *args)
+
+        if pass_fusion is True:
+            qkv = attn.to_qkv(hidden_states, *args)
+            query, key, value = paddle.split(qkv, 3, -1)
+        else:
+            query = attn.to_q(hidden_states, *args)
+            if encoder_hidden_states is None:
+                encoder_hidden_states = hidden_states
+            elif attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+            key = attn.to_k(encoder_hidden_states, *args)
+            value = attn.to_v(encoder_hidden_states, *args)
 
         query = attn.head_to_batch_dim(query, transpose=False)
         key = attn.head_to_batch_dim(key, transpose=False)

diff --git a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py
@@ -291,6 +291,24 @@ def _decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOu
 
         return DecoderOutput(sample=dec)
 
+    @paddle.incubate.jit.inference(
+                                   cache_static_model=True,
+                                # Disabled options kept for reference:
+                                #    with_trt=False, collect_shape=False, trt_precision_mode="float16",
+                                #    exp_disable_tensorrt_ops=["memory_efficient_attention"],
+                                # NOTE(review): `haha` is a placeholder name; `decode` calls it -- rename both together.
+                                   )
+    def haha(self, z):
+        # Decode body invoked by `decode`; cast z to the post_quant_conv weight dtype (TODO junnyu: pure fp16).
+        z = z.cast(self.post_quant_conv.weight.dtype)
+        if self.use_slicing and z.shape[0] > 1:
+            # split/chunk may behave differently between paddle and pytorch
+            decoded_slices = [self._decode(z_slice).sample for z_slice in z.chunk(z.shape[0])]
+            decoded = paddle.concat(decoded_slices)
+        else:
+            decoded = self._decode(z).sample
+        return decoded
+
     @apply_forward_hook
     def decode(
         self, z: paddle.Tensor, return_dict: bool = True, generator=None
@@ -309,15 +327,16 @@ def decode(
                 returned.
 
         """
-        # TODO junnyu, add this to support pure fp16
-        z = z.cast(self.post_quant_conv.weight.dtype)
-        if self.use_slicing and z.shape[0] > 1:
-            # split、chunk paddle vs pytorch may have some difference
-            decoded_slices = [self._decode(z_slice).sample for z_slice in z.chunk(z.shape[0])]
-            decoded = paddle.concat(decoded_slices)
-        else:
-            decoded = self._decode(z).sample
-
+        # # TODO junnyu, add this to support pure fp16
+        # z = z.cast(self.post_quant_conv.weight.dtype)
+        # if self.use_slicing and z.shape[0] > 1:
+        #     # split、chunk paddle vs pytorch may have some difference
+        #     decoded_slices = [self._decode(z_slice).sample for z_slice in z.chunk(z.shape[0])]
+        #     decoded = paddle.concat(decoded_slices)
+        # else:
+        #     decoded = self._decode(z).sample
+
+        decoded = self.haha(z)
         if not return_dict:
             return (decoded,)
 

diff --git a/ppdiffusers/ppdiffusers/models/modeling_utils.py b/ppdiffusers/ppdiffusers/models/modeling_utils.py
@@ -1050,6 +1050,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
         return model
 
+    @classmethod
+    def custom_modify_weight(cls, state_dict):
+        """No-op hook called with `state_dict` just before weights are set; override to edit it in place."""
+
     @classmethod
     def _load_pretrained_model(
         cls,
@@ -1130,6 +1134,7 @@ def _find_mismatched_keys(
                     error_msgs.append(
                         f"Error size mismatch, {key_name} receives a shape {loaded_shape}, but the expected shape is {model_shape}."
                     )
+                cls.custom_modify_weight(state_dict)
                 faster_set_state_dict(model_to_load, state_dict)
 
         missing_keys = sorted(list(set(expected_keys) - set(loaded_keys)))

diff --git a/ppdiffusers/ppdiffusers/models/normalization.py b/ppdiffusers/ppdiffusers/models/normalization.py
@@ -20,7 +20,7 @@
 
 from .activations import get_activation
 from .embeddings import CombinedTimestepLabelEmbeddings, CombinedTimestepSizeEmbeddings
-
+from paddle.incubate.tt import adaptive_layer_norm, fused_adaLN_scale_residual
 
 class AdaLayerNorm(nn.Layer):
     r"""
@@ -74,7 +74,8 @@ def forward(
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, axis=1)
-        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        # x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        x = paddle.incubate.tt.adaptive_layer_norm(x, scale_msa, shift_msa,self.norm.weight,self.norm.bias)
         return x, gate_msa, shift_mlp, scale_mlp, gate_mlp