From 27bd7f1e9f78dab5832687a38746f6e637096ed4 Mon Sep 17 00:00:00 2001
From: Pierre Chapuis
Date: Thu, 28 Nov 2024 15:34:44 +0100
Subject: [PATCH] Tiled VAE: fix bug with pathologic size (tile size - overlap + 1)

This fixes the error:

> The size of tensor a (128) must match the size of tensor b (0)
> at non-singleton dimension 0
---
 .../latent_diffusion/auto_encoder.py          |  4 ++--
 .../latent_diffusion/test_autoencoders.py     | 23 +++++++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/refiners/foundationals/latent_diffusion/auto_encoder.py b/src/refiners/foundationals/latent_diffusion/auto_encoder.py
index 89cbb868..6c9edfa0 100644
--- a/src/refiners/foundationals/latent_diffusion/auto_encoder.py
+++ b/src/refiners/foundationals/latent_diffusion/auto_encoder.py
@@ -415,8 +415,8 @@ def _generate_latent_tiles(size: _ImageSize, tile_size: _ImageSize, overlap: int
     """
     tiles: list[_Tile] = []
 
-    for x in range(0, size.width, tile_size.width - overlap):
-        for y in range(0, size.height, tile_size.height - overlap):
+    for x in range(0, max(size.width - overlap, 1), tile_size.width - overlap):
+        for y in range(0, max(size.height - overlap, 1), tile_size.height - overlap):
             tile = _Tile(
                 top=max(0, y),
                 left=max(0, x),
diff --git a/tests/foundationals/latent_diffusion/test_autoencoders.py b/tests/foundationals/latent_diffusion/test_autoencoders.py
index 561007cf..1dd52256 100644
--- a/tests/foundationals/latent_diffusion/test_autoencoders.py
+++ b/tests/foundationals/latent_diffusion/test_autoencoders.py
@@ -107,6 +107,29 @@ def test_tiled_autoencoder_rectangular_image(autoencoder: LatentDiffusionAutoenc
     ensure_similar_images(sample_image, result, min_psnr=37, min_ssim=0.985)
 
 
+@no_grad()
+@pytest.mark.parametrize("tile_w", [240, 242, 244, 254, 256, 258])
+def test_tiled_autoencoder_pathologic_sizes(
+    refiners_sd15_autoencoder: SD1Autoencoder,
+    sample_image: Image.Image,
+    test_device: torch.device,
+    tile_w: int,
+):
+    # 242 is the true pathologic case, a tile just larger than (tile size - overlap).
+    # 242 * 4 = 968 = (128 - 8 + 1) * 8
+
+    autoencoder = refiners_sd15_autoencoder.to(device=test_device, dtype=torch.float32)
+
+    sample_image = sample_image.crop((0, 0, tile_w, 400))
+    sample_image = sample_image.resize((sample_image.width * 4, sample_image.height * 4))
+
+    with autoencoder.tiled_inference(sample_image, tile_size=(1024, 1024)):
+        encoded = autoencoder.tiled_image_to_latents(sample_image)
+        result = autoencoder.tiled_latents_to_image(encoded)
+
+    ensure_similar_images(sample_image, result, min_psnr=37, min_ssim=0.985)
+
+
 def test_value_error_tile_encode_no_context(autoencoder: LatentDiffusionAutoencoder, sample_image: Image.Image) -> None:
     with pytest.raises(ValueError):
         autoencoder.tiled_image_to_latents(sample_image)
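
For context, a minimal sketch of the loop bound before and after the change,
using the values implied by the test above (latent tile size 128, overlap 8,
so the pathologic latent size is 128 - 8 + 1 = 121):

    tile, overlap = 128, 8
    size = tile - overlap + 1  # 121: the pathologic latent size

    # before: the loop yields an extra start offset at the very last row/column,
    # producing a one-latent sliver tile, which is what triggered the size mismatch above
    list(range(0, size, tile - overlap))                     # [0, 120]

    # after: clamping the stop value to size - overlap drops the sliver tile
    list(range(0, max(size - overlap, 1), tile - overlap))   # [0]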