
fix(diffusers/pipelines): sd gligen fixing
townwish4git committed Jul 1, 2024
1 parent 3b28f6e commit d8bf83b
Showing 2 changed files with 11 additions and 5 deletions.
1 change: 1 addition & 0 deletions mindone/diffusers/pipelines/pipeline_loading_utils.py
@@ -60,6 +60,7 @@
"MSPreTrainedModel": ["save_pretrained", "from_pretrained"],
"PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
"PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
"ProcessorMixin": ["save_pretrained", "from_pretrained"],
"ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
},
}
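
The mapping above tells the pipeline loader which save/load methods to call for each third-party component class; registering ProcessorMixin lets pipelines that bundle a transformers processor (as the GLIGEN text-image pipeline does) round-trip through save_pretrained / from_pretrained. A minimal sketch of how such a mapping can be consulted, assuming a generic dispatch over the component's base classes (DUMMY_LOADABLE and save_component are illustrative names, not mindone APIs):

import os

DUMMY_LOADABLE = {
    "ProcessorMixin": ["save_pretrained", "from_pretrained"],
    "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
}

def save_component(component, name: str, save_directory: str) -> bool:
    """Walk the component's MRO; call the registered save method of the first match."""
    for base in type(component).__mro__:
        methods = DUMMY_LOADABLE.get(base.__name__)
        if methods is not None:
            save_method = getattr(component, methods[0])  # e.g. "save_pretrained"
            save_method(os.path.join(save_directory, name))
            return True
    return False  # not a registered loadable class; the caller decides what to do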
Second changed file (the SD GLIGEN pipeline): 10 additions & 5 deletions
@@ -585,7 +585,7 @@ def get_clip_feature(self, input, normalize_constant, is_image=False):
 inputs = self.tokenizer(input, return_tensors="np", padding=True)
 for k, v in inputs.items():
     inputs[k] = ms.Tensor.from_numpy(v)
-outputs = self.text_encoder(**inputs)[0]
+outputs = self.text_encoder(**inputs)
 feature = outputs[1]
 return feature
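
The fix keeps the text encoder's full output so that outputs[1] selects the pooled phrase embedding instead of slicing into the hidden states. A rough illustration, assuming the MindSpore CLIP text encoder returns a tuple of (last_hidden_state, pooler_output); NumPy placeholders with typical CLIP shapes stand in for the real tensors:

import numpy as np

batch, seq_len, dim = 3, 77, 768
last_hidden_state = np.zeros((batch, seq_len, dim))  # per-token embeddings
pooler_output = np.zeros((batch, dim))               # one pooled vector per phrase
outputs = (last_hidden_state, pooler_output)

# Before the fix, text_encoder(**inputs)[0] already selected last_hidden_state,
# so the later outputs[1] grabbed the token embeddings of the second phrase.
before_fix = outputs[0][1]   # shape (77, 768): the wrong tensor entirely
after_fix = outputs[1]       # shape (3, 768): pooled feature per phrase
print(before_fix.shape, after_fix.shape)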

@@ -626,10 +626,10 @@ def get_cross_attention_kwargs_with_grounded(
 boxes[idx] = ms.tensor(box)
 masks[idx] = 1
 if text_feature is not None:
-    phrases_embeddings[idx] = text_feature
+    phrases_embeddings[idx : idx + 1] = text_feature  # unsqueeze for shape matching
     phrases_masks[idx] = 1
 if image_feature is not None:
-    image_embeddings[idx] = image_feature
+    image_embeddings[idx : idx + 1] = image_feature  # unsqueeze for shape matching
     image_masks[idx] = 1

 input_phrases_mask = self.complete_mask(input_phrases_mask, max_objs)
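
Assigning into the length-1 slice keeps both sides of the assignment at shape (1, dim), so the pooled CLIP feature, which still carries its leading batch dimension, drops into the preallocated buffer without an explicit squeeze. A small sketch under that assumption (buffer sizes are illustrative):

import mindspore as ms
from mindspore import ops

max_objs, dim = 30, 768
phrases_embeddings = ops.zeros((max_objs, dim), ms.float32)  # preallocated per-object buffer
text_feature = ops.ones((1, dim), ms.float32)                # pooled feature, shape (1, 768)

idx = 0
# phrases_embeddings[idx] expects a (dim,) value; the slice below is a (1, dim)
# view that matches text_feature's shape exactly, so no reshape is needed.
phrases_embeddings[idx : idx + 1] = text_feature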
@@ -932,11 +932,16 @@ def __call__(
 with self.progress_bar(total=num_inference_steps) as progress_bar:
     for i, t in enumerate(timesteps):
         if latents.shape[1] != 4:
-            latents = ops.randn_like(latents[:, :4])
+            # mindspore.ops.randn_like(x) returns tensor with dtype float32, instead of x.dtype as torch does
+            latents = ops.randn_like(latents[:, :4], dtype=latents.dtype)

         if gligen_inpaint_image is not None:
             gligen_inpaint_latent_with_noise = (
-                self.scheduler.add_noise(gligen_inpaint_latent, ops.randn_like(gligen_inpaint_latent), t[None])
+                self.scheduler.add_noise(
+                    gligen_inpaint_latent,
+                    ops.randn_like(gligen_inpaint_latent, dtype=gligen_inpaint_latent.dtype),
+                    t[None],
+                )
                 .broadcast_to((latents.shape[0], -1, -1, -1))
                 .copy()
             )
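
Both changes pass an explicit dtype because, as the in-line comment notes, MindSpore's ops.randn_like defaults to float32 rather than inheriting the input's dtype the way torch.randn_like does; without it, fp16 latents would be silently upcast and clash with the dtype the rest of the pipeline expects. A quick check of that behaviour (runnable in PyNative mode):

import mindspore as ms
from mindspore import ops

latents = ops.ones((1, 4, 8, 8), ms.float16)

noise_default = ops.randn_like(latents)                        # comes back as float32, per the comment above
noise_matched = ops.randn_like(latents, dtype=latents.dtype)   # stays float16
print(noise_default.dtype, noise_matched.dtype)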
