
fix(diffusers/pipelines): sd gligen fixing
townwish4git committed Jul 1, 2024
1 parent 3b28f6e commit d8bf83b
Showing 2 changed files with 11 additions and 5 deletions.
1 change: 1 addition & 0 deletions mindone/diffusers/pipelines/pipeline_loading_utils.py
@@ -60,6 +60,7 @@
"MSPreTrainedModel": ["save_pretrained", "from_pretrained"],
"PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
"PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
"ProcessorMixin": ["save_pretrained", "from_pretrained"],
"ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
},
}
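
The mapping above tells the pipeline loader which save/load methods to call for each third-party component class; registering ProcessorMixin lets pipelines that bundle a transformers processor (as the GLIGEN text-image pipeline does) round-trip through save_pretrained / from_pretrained. A minimal sketch of how such a mapping can be consulted, assuming a generic dispatch over the component's base classes (DUMMY_LOADABLE and save_component are illustrative names, not mindone APIs):

import os

DUMMY_LOADABLE = {
    "ProcessorMixin": ["save_pretrained", "from_pretrained"],
    "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
}

def save_component(component, name: str, save_directory: str) -> bool:
    """Walk the component's MRO; call the registered save method of the first match."""
    for base in type(component).__mro__:
        methods = DUMMY_LOADABLE.get(base.__name__)
        if methods is not None:
            save_method = getattr(component, methods[0])  # e.g. "save_pretrained"
            save_method(os.path.join(save_directory, name))
            return True
    return False  # not a registered loadable class; the caller decides what to do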
Second changed file (the SD GLIGEN pipeline): 10 additions & 5 deletions
@@ -585,7 +585,7 @@ def get_clip_feature(self, input, normalize_constant, is_image=False):
 inputs = self.tokenizer(input, return_tensors="np", padding=True)
 for k, v in inputs.items():
     inputs[k] = ms.Tensor.from_numpy(v)
-outputs = self.text_encoder(**inputs)[0]
+outputs = self.text_encoder(**inputs)
 feature = outputs[1]
 return feature
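
The fix keeps the text encoder's full output so that outputs[1] selects the pooled phrase embedding instead of slicing into the hidden states. A rough illustration, assuming the MindSpore CLIP text encoder returns a tuple of (last_hidden_state, pooler_output); NumPy placeholders with typical CLIP shapes stand in for the real tensors:

import numpy as np

batch, seq_len, dim = 3, 77, 768
last_hidden_state = np.zeros((batch, seq_len, dim))  # per-token embeddings
pooler_output = np.zeros((batch, dim))               # one pooled vector per phrase
outputs = (last_hidden_state, pooler_output)

# Before the fix, text_encoder(**inputs)[0] already selected last_hidden_state,
# so the later outputs[1] grabbed the token embeddings of the second phrase.
before_fix = outputs[0][1]   # shape (77, 768): the wrong tensor entirely
after_fix = outputs[1]       # shape (3, 768): pooled feature per phrase
print(before_fix.shape, after_fix.shape)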

@@ -626,10 +626,10 @@ def get_cross_attention_kwargs_with_grounded(
 boxes[idx] = ms.tensor(box)
 masks[idx] = 1
 if text_feature is not None:
-    phrases_embeddings[idx] = text_feature
+    phrases_embeddings[idx : idx + 1] = text_feature  # unsqueeze for shape matching
     phrases_masks[idx] = 1
 if image_feature is not None:
-    image_embeddings[idx] = image_feature
+    image_embeddings[idx : idx + 1] = image_feature  # unsqueeze for shape matching
     image_masks[idx] = 1

 input_phrases_mask = self.complete_mask(input_phrases_mask, max_objs)
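
Assigning into the length-1 slice keeps both sides of the assignment at shape (1, dim), so the pooled CLIP feature, which still carries its leading batch dimension, drops into the preallocated buffer without an explicit squeeze. A small sketch under that assumption (buffer sizes are illustrative):

import mindspore as ms
from mindspore import ops

max_objs, dim = 30, 768
phrases_embeddings = ops.zeros((max_objs, dim), ms.float32)  # preallocated per-object buffer
text_feature = ops.ones((1, dim), ms.float32)                # pooled feature, shape (1, 768)

idx = 0
# phrases_embeddings[idx] expects a (dim,) value; the slice below is a (1, dim)
# view that matches text_feature's shape exactly, so no reshape is needed.
phrases_embeddings[idx : idx + 1] = text_feature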
@@ -932,11 +932,16 @@ def __call__(
 with self.progress_bar(total=num_inference_steps) as progress_bar:
     for i, t in enumerate(timesteps):
         if latents.shape[1] != 4:
-            latents = ops.randn_like(latents[:, :4])
+            # mindspore.ops.randn_like(x) returns tensor with dtype float32, instead of x.dtype as torch does
+            latents = ops.randn_like(latents[:, :4], dtype=latents.dtype)

         if gligen_inpaint_image is not None:
             gligen_inpaint_latent_with_noise = (
-                self.scheduler.add_noise(gligen_inpaint_latent, ops.randn_like(gligen_inpaint_latent), t[None])
+                self.scheduler.add_noise(
+                    gligen_inpaint_latent,
+                    ops.randn_like(gligen_inpaint_latent, dtype=gligen_inpaint_latent.dtype),
+                    t[None],
+                )
                 .broadcast_to((latents.shape[0], -1, -1, -1))
                 .copy()
             )
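
Both changes pass an explicit dtype because, as the in-line comment notes, MindSpore's ops.randn_like defaults to float32 rather than inheriting the input's dtype the way torch.randn_like does; without it, fp16 latents would be silently upcast and clash with the dtype the rest of the pipeline expects. A quick check of that behaviour (runnable in PyNative mode):

import mindspore as ms
from mindspore import ops

latents = ops.ones((1, 4, 8, 8), ms.float16)

noise_default = ops.randn_like(latents)                        # comes back as float32, per the comment above
noise_matched = ops.randn_like(latents, dtype=latents.dtype)   # stays float16
print(noise_default.dtype, noise_matched.dtype)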
