From d8bf83b98955b27cb1dae7f3c7e082b0c8b219a1 Mon Sep 17 00:00:00 2001 From: townwish4git Date: Mon, 1 Jul 2024 16:13:36 +0800 Subject: [PATCH] fix(diffusers/pipelines): sd gligen fixing --- .../diffusers/pipelines/pipeline_loading_utils.py | 1 + ...pipeline_stable_diffusion_gligen_text_image.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/mindone/diffusers/pipelines/pipeline_loading_utils.py b/mindone/diffusers/pipelines/pipeline_loading_utils.py index 6af381d6f1..7a1a93b374 100644 --- a/mindone/diffusers/pipelines/pipeline_loading_utils.py +++ b/mindone/diffusers/pipelines/pipeline_loading_utils.py @@ -60,6 +60,7 @@ "MSPreTrainedModel": ["save_pretrained", "from_pretrained"], "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"], "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"], + "ProcessorMixin": ["save_pretrained", "from_pretrained"], "ImageProcessingMixin": ["save_pretrained", "from_pretrained"], }, } diff --git a/mindone/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/mindone/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py index 12f3fabd34..678e1e85d3 100644 --- a/mindone/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +++ b/mindone/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py @@ -585,7 +585,7 @@ def get_clip_feature(self, input, normalize_constant, is_image=False): inputs = self.tokenizer(input, return_tensors="np", padding=True) for k, v in inputs.items(): inputs[k] = ms.Tensor.from_numpy(v) - outputs = self.text_encoder(**inputs)[0] + outputs = self.text_encoder(**inputs) feature = outputs[1] return feature @@ -626,10 +626,10 @@ def get_cross_attention_kwargs_with_grounded( boxes[idx] = ms.tensor(box) masks[idx] = 1 if text_feature is not None: - phrases_embeddings[idx] = text_feature + phrases_embeddings[idx : idx + 1] = text_feature # unsqueeze for shape matching phrases_masks[idx] = 1 if image_feature is not None: - image_embeddings[idx] = image_feature + image_embeddings[idx : idx + 1] = image_feature # unsqueeze for shape matching image_masks[idx] = 1 input_phrases_mask = self.complete_mask(input_phrases_mask, max_objs) @@ -932,11 +932,16 @@ def __call__( with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): if latents.shape[1] != 4: - latents = ops.randn_like(latents[:, :4]) + # mindspore.ops.randn_like(x) returns tensor with dtype float32, instead of x.dtype as torch does + latents = ops.randn_like(latents[:, :4], dtype=latents.dtype) if gligen_inpaint_image is not None: gligen_inpaint_latent_with_noise = ( - self.scheduler.add_noise(gligen_inpaint_latent, ops.randn_like(gligen_inpaint_latent), t[None]) + self.scheduler.add_noise( + gligen_inpaint_latent, + ops.randn_like(gligen_inpaint_latent, dtype=gligen_inpaint_latent.dtype), + t[None], + ) .broadcast_to((latents.shape[0], -1, -1, -1)) .copy() )