diff --git a/.github/workflows/run_tests_from_a_pr.yml b/.github/workflows/run_tests_from_a_pr.yml
new file mode 100644
index 000000000000..782c0db417ff
--- /dev/null
+++ b/.github/workflows/run_tests_from_a_pr.yml
@@ -0,0 +1,73 @@
+name: Check running SLOW tests from a PR (only GPU)
+
+on:
+ workflow_dispatch:
+ inputs:
+ docker_image:
+ default: 'diffusers/diffusers-pytorch-cuda'
+ description: 'Name of the Docker image'
+ required: true
+ branch:
+ description: 'PR Branch to test on'
+ required: true
+ test:
+ description: 'Tests to run (e.g.: `tests/models`).'
+ required: true
+
+env:
+ DIFFUSERS_IS_CI: yes
+ IS_GITHUB_CI: "1"
+ HF_HOME: /mnt/cache
+ OMP_NUM_THREADS: 8
+ MKL_NUM_THREADS: 8
+ PYTEST_TIMEOUT: 600
+ RUN_SLOW: yes
+
+jobs:
+ run_tests:
+ name: "Run a test on our runner from a PR"
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
+ container:
+ image: ${{ github.event.inputs.docker_image }}
+ options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+
+ steps:
+ - name: Validate test files input
+ id: validate_test_files
+ env:
+ PY_TEST: ${{ github.event.inputs.test }}
+ run: |
+ if [[ ! "$PY_TEST" =~ ^tests/ ]]; then
+ echo "Error: The input string must start with 'tests/'."
+ exit 1
+ fi
+
+ if [[ ! "$PY_TEST" =~ ^tests/(models|pipelines) ]]; then
+ echo "Error: The input string must contain either 'models' or 'pipelines' after 'tests/'."
+ exit 1
+ fi
+
+ if [[ "$PY_TEST" == *";"* ]]; then
+ echo "Error: The input string must not contain ';'."
+ exit 1
+ fi
+ echo "$PY_TEST"
+
+ - name: Checkout PR branch
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ github.event.inputs.branch }}
+ repository: ${{ github.event.pull_request.head.repo.full_name }}
+
+
+ - name: Install pytest
+ run: |
+ python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+ python -m uv pip install -e [quality,test]
+ python -m uv pip install peft
+
+ - name: Run tests
+ env:
+ PY_TEST: ${{ github.event.inputs.test }}
+ run: |
+ pytest "$PY_TEST"
\ No newline at end of file
diff --git a/docker/diffusers-doc-builder/Dockerfile b/docker/diffusers-doc-builder/Dockerfile
index 6f5f6d633245..cd8fa66983c5 100644
--- a/docker/diffusers-doc-builder/Dockerfile
+++ b/docker/diffusers-doc-builder/Dockerfile
@@ -18,6 +18,8 @@ RUN apt install -y bash \
python3.10 \
python3-pip \
libgl1 \
+ zip \
+ wget \
python3.10-venv && \
rm -rf /var/lib/apt/lists
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 2f4651ba3417..29fb3241aad9 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -59,6 +59,8 @@
title: Distributed inference with multiple GPUs
- local: using-diffusers/merge_loras
title: Merge LoRAs
+ - local: using-diffusers/scheduler_features
+ title: Scheduler features
- local: using-diffusers/callback
title: Pipeline callbacks
- local: using-diffusers/reusing_seeds
@@ -68,6 +70,10 @@
- local: using-diffusers/weighted_prompts
title: Prompt techniques
title: Inference techniques
+- sections:
+ - local: advanced_inference/outpaint
+ title: Outpainting
+ title: Advanced inference
- sections:
- local: using-diffusers/sdxl
title: Stable Diffusion XL
@@ -93,6 +99,8 @@
title: Trajectory Consistency Distillation-LoRA
- local: using-diffusers/svd
title: Stable Video Diffusion
+ - local: using-diffusers/marigold_usage
+ title: Marigold Computer Vision
title: Specific pipeline examples
- sections:
- local: training/overview
@@ -295,6 +303,8 @@
title: Latent Diffusion
- local: api/pipelines/ledits_pp
title: LEDITS++
+ - local: api/pipelines/marigold
+ title: Marigold
- local: api/pipelines/panorama
title: MultiDiffusion
- local: api/pipelines/musicldm
@@ -445,4 +455,4 @@
title: Video Processor
title: Internal classes
isExpanded: false
- title: API
+ title: API
\ No newline at end of file
diff --git a/docs/source/en/advanced_inference/outpaint.md b/docs/source/en/advanced_inference/outpaint.md
new file mode 100644
index 000000000000..f3a7bd99d8fa
--- /dev/null
+++ b/docs/source/en/advanced_inference/outpaint.md
@@ -0,0 +1,231 @@
+
+
+# Outpainting
+
+Outpainting extends an image beyond its original boundaries, allowing you to add, replace, or modify visual elements while preserving the original image. Like [inpainting](../using-diffusers/inpaint), you want to fill the white area (in this case, the area outside of the original image) with new visual elements while keeping the original image (represented by a mask of black pixels). There are a couple of ways to outpaint, such as with a [ControlNet](https://hf.co/blog/OzzyGT/outpainting-controlnet) or with [Differential Diffusion](https://hf.co/blog/OzzyGT/outpainting-differential-diffusion).
+
+This guide will show you how to outpaint with an inpainting model, ControlNet, and a ZoeDepth estimator.
+
+Before you begin, make sure you have the [controlnet_aux](https://github.com/huggingface/controlnet_aux) library installed so you can use the ZoeDepth estimator.
+
+```py
+!pip install -q controlnet_aux
+```
+
+## Image preparation
+
+Start by picking an image to outpaint with and remove the background with a Space like [BRIA-RMBG-1.4](https://hf.co/spaces/briaai/BRIA-RMBG-1.4).
+
+
+
+For example, remove the background from this image of a pair of shoes.
+
+
+
+
+
+*original image*
+
+*background removed*
+
+
+
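+If you prefer to remove the background in code rather than through a Space, the snippet below is a minimal sketch that assumes the third-party [rembg](https://github.com/danielgatis/rembg) library (`pip install rembg`) is available; it uses its own segmentation model rather than BRIA-RMBG-1.4, so results may differ slightly.
+
+```py
+from PIL import Image
+from rembg import remove
+
+# Hypothetical local copy of the original image
+original = Image.open("shoes.png")
+no_background = remove(original)  # returns an RGBA image with a transparent background
+no_background.save("no-background-shoes.png")
+```
+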
+[Stable Diffusion XL (SDXL)](../using-diffusers/sdxl) models work best with 1024x1024 images, but you can resize the image to any size as long as your hardware has enough memory to support it. The transparent background in the image should also be replaced with a white background. Create a function (like the one below) that scales and pastes the image onto a white background.
+
+```py
+import random
+
+import requests
+import torch
+from controlnet_aux import ZoeDetector
+from PIL import Image, ImageOps
+
+from diffusers import (
+ AutoencoderKL,
+ ControlNetModel,
+ StableDiffusionXLControlNetPipeline,
+ StableDiffusionXLInpaintPipeline,
+)
+
+def scale_and_paste(original_image):
+ aspect_ratio = original_image.width / original_image.height
+
+ if original_image.width > original_image.height:
+ new_width = 1024
+ new_height = round(new_width / aspect_ratio)
+ else:
+ new_height = 1024
+ new_width = round(new_height * aspect_ratio)
+
+ resized_original = original_image.resize((new_width, new_height), Image.LANCZOS)
+ white_background = Image.new("RGBA", (1024, 1024), "white")
+ x = (1024 - new_width) // 2
+ y = (1024 - new_height) // 2
+ white_background.paste(resized_original, (x, y), resized_original)
+
+ return resized_original, white_background
+
+original_image = Image.open(
+ requests.get(
+ "https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/no-background-jordan.png",
+ stream=True,
+ ).raw
+).convert("RGBA")
+resized_img, white_bg_image = scale_and_paste(original_image)
+```
+
+To avoid adding unwanted extra details, use the ZoeDepth estimator to provide additional guidance during generation and to ensure the shoes remain consistent with the original image.
+
+```py
+zoe = ZoeDetector.from_pretrained("lllyasviel/Annotators")
+image_zoe = zoe(white_bg_image, detect_resolution=512, image_resolution=1024)
+image_zoe
+```
+
+
+
+
+
+## Outpaint
+
+Once your image is ready, you can generate content in the white area around the shoes with [controlnet-inpaint-dreamer-sdxl](https://hf.co/destitech/controlnet-inpaint-dreamer-sdxl), an SDXL ControlNet trained for inpainting.
+
+Load the inpainting ControlNet, the ZoeDepth ControlNet, and the VAE, and pass them to the [`StableDiffusionXLControlNetPipeline`]. Then you can create an optional `generate_image` function (for convenience) to outpaint an initial image.
+
+```py
+controlnets = [
+ ControlNetModel.from_pretrained(
+ "destitech/controlnet-inpaint-dreamer-sdxl", torch_dtype=torch.float16, variant="fp16"
+ ),
+ ControlNetModel.from_pretrained(
+ "diffusers/controlnet-zoe-depth-sdxl-1.0", torch_dtype=torch.float16
+ ),
+]
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda")
+pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
+ "SG161222/RealVisXL_V4.0", torch_dtype=torch.float16, variant="fp16", controlnet=controlnets, vae=vae
+).to("cuda")
+
+def generate_image(prompt, negative_prompt, inpaint_image, zoe_image, seed: int = None):
+ if seed is None:
+ seed = random.randint(0, 2**32 - 1)
+
+ generator = torch.Generator(device="cpu").manual_seed(seed)
+
+ image = pipeline(
+ prompt,
+ negative_prompt=negative_prompt,
+ image=[inpaint_image, zoe_image],
+ guidance_scale=6.5,
+ num_inference_steps=25,
+ generator=generator,
+ controlnet_conditioning_scale=[0.5, 0.8],
+ control_guidance_end=[0.9, 0.6],
+ ).images[0]
+
+ return image
+
+prompt = "nike air jordans on a basketball court"
+negative_prompt = ""
+
+temp_image = generate_image(prompt, negative_prompt, white_bg_image, image_zoe, 908097)
+```
+
+Paste the original image over the initial outpainted image. You'll improve the outpainted background in a later step.
+
+```py
+x = (1024 - resized_img.width) // 2
+y = (1024 - resized_img.height) // 2
+temp_image.paste(resized_img, (x, y), resized_img)
+temp_image
+```
+
+
+
+
+
+> [!TIP]
+> Now is a good time to free up some memory if you're running low!
+>
+> ```py
+> pipeline=None
+> torch.cuda.empty_cache()
+> ```
+
+Now that you have an initial outpainted image, load the [`StableDiffusionXLInpaintPipeline`] with the [RealVisXL](https://hf.co/SG161222/RealVisXL_V4.0) model to generate the final outpainted image with better quality.
+
+```py
+pipeline = StableDiffusionXLInpaintPipeline.from_pretrained(
+ "OzzyGT/RealVisXL_V4.0_inpainting",
+ torch_dtype=torch.float16,
+ variant="fp16",
+ vae=vae,
+).to("cuda")
+```
+
+Prepare a mask for the final outpainted image. To create a more natural transition between the original image and the outpainted background, blur the mask to help it blend better.
+
+```py
+mask = Image.new("L", temp_image.size)
+mask.paste(resized_img.split()[3], (x, y))
+mask = ImageOps.invert(mask)
+final_mask = mask.point(lambda p: p > 128 and 255)
+mask_blurred = pipeline.mask_processor.blur(final_mask, blur_factor=20)
+mask_blurred
+```
+
+
+
+
+
+Create a better prompt and pass it to the `generate_outpaint` function to generate the final outpainted image. Again, paste the original image over the final outpainted background.
+
+```py
+def generate_outpaint(prompt, negative_prompt, image, mask, seed: int = None):
+ if seed is None:
+ seed = random.randint(0, 2**32 - 1)
+
+ generator = torch.Generator(device="cpu").manual_seed(seed)
+
+ image = pipeline(
+ prompt,
+ negative_prompt=negative_prompt,
+ image=image,
+ mask_image=mask,
+ guidance_scale=10.0,
+ strength=0.8,
+ num_inference_steps=30,
+ generator=generator,
+ ).images[0]
+
+ return image
+
+prompt = "high quality photo of nike air jordans on a basketball court, highly detailed"
+negative_prompt = ""
+
+final_image = generate_outpaint(prompt, negative_prompt, temp_image, mask_blurred, 7688778)
+x = (1024 - resized_img.width) // 2
+y = (1024 - resized_img.height) // 2
+final_image.paste(resized_img, (x, y), resized_img)
+final_image
+```
+
+
+
+
diff --git a/docs/source/en/api/models/vq.md b/docs/source/en/api/models/vq.md
index a5ac6ba63e39..fa0631e6fe0b 100644
--- a/docs/source/en/api/models/vq.md
+++ b/docs/source/en/api/models/vq.md
@@ -24,4 +24,4 @@ The abstract from the paper is:
## VQEncoderOutput
-[[autodoc]] models.vq_model.VQEncoderOutput
+[[autodoc]] models.autoencoders.vq_model.VQEncoderOutput
diff --git a/docs/source/en/api/pipelines/i2vgenxl.md b/docs/source/en/api/pipelines/i2vgenxl.md
index cafffaac3bd6..cbb6be1176fd 100644
--- a/docs/source/en/api/pipelines/i2vgenxl.md
+++ b/docs/source/en/api/pipelines/i2vgenxl.md
@@ -47,6 +47,7 @@ Sample output with I2VGenXL:
* Unlike SVD, it additionally accepts text prompts as inputs.
* It can generate higher resolution videos.
* When using the [`DDIMScheduler`] (which is default for this pipeline), less than 50 steps for inference leads to bad results.
+* This implementation is a 1-stage variant of I2VGenXL. The main figure in the [I2VGen-XL](https://arxiv.org/abs/2311.04145) paper shows a 2-stage variant; however, the 1-stage variant works well. See [this discussion](https://github.com/huggingface/diffusers/discussions/7952) for more details.
## I2VGenXLPipeline
[[autodoc]] I2VGenXLPipeline
diff --git a/docs/source/en/api/pipelines/marigold.md b/docs/source/en/api/pipelines/marigold.md
new file mode 100644
index 000000000000..e235368eb047
--- /dev/null
+++ b/docs/source/en/api/pipelines/marigold.md
@@ -0,0 +1,76 @@
+
+
+# Marigold Pipelines for Computer Vision Tasks
+
+![marigold](https://marigoldmonodepth.github.io/images/teaser_collage_compressed.jpg)
+
+Marigold was proposed in [Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145), a CVPR 2024 Oral paper by [Bingxin Ke](http://www.kebingxin.com/), [Anton Obukhov](https://www.obukhov.ai/), [Shengyu Huang](https://shengyuh.github.io/), [Nando Metzger](https://nandometzger.github.io/), [Rodrigo Caye Daudt](https://rcdaudt.github.io/), and [Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
+The idea is to repurpose the rich generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional computer vision tasks.
+Initially, this idea was explored to fine-tune Stable Diffusion for Monocular Depth Estimation, as shown in the teaser above.
+Later,
+- [Tianfu Wang](https://tianfwang.github.io/) trained the first Latent Consistency Model (LCM) of Marigold, which unlocked fast single-step inference;
+- [Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US) extended the approach to Surface Normals Estimation;
+- [Anton Obukhov](https://www.obukhov.ai/) contributed the pipelines and documentation to diffusers (enabled and supported by [YiYi Xu](https://yiyixuxu.github.io/) and [Sayak Paul](https://sayak.dev/)).
+
+The abstract from the paper is:
+
+*Monocular depth estimation is a fundamental computer vision task. Recovering 3D depth from a single image is geometrically ill-posed and requires scene understanding, so it is not surprising that the rise of deep learning has led to a breakthrough. The impressive progress of monocular depth estimators has mirrored the growth in model capacity, from relatively modest CNNs to large Transformer architectures. Still, monocular depth estimators tend to struggle when presented with images with unfamiliar content and layout, since their knowledge of the visual world is restricted by the data seen during training, and challenged by zero-shot generalization to new domains. This motivates us to explore whether the extensive priors captured in recent generative diffusion models can enable better, more generalizable depth estimation. We introduce Marigold, a method for affine-invariant monocular depth estimation that is derived from Stable Diffusion and retains its rich prior knowledge. The estimator can be fine-tuned in a couple of days on a single GPU using only synthetic training data. It delivers state-of-the-art performance across a wide range of datasets, including over 20% performance gains in specific cases. Project page: https://marigoldmonodepth.github.io.*
+
+## Available Pipelines
+
+Each pipeline supports one Computer Vision task, taking an RGB image as input and producing a *prediction* of the modality of interest, such as a depth map of the input image.
+Currently, the following tasks are implemented:
+
+| Pipeline | Predicted Modalities | Demos |
+|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
+| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
+| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm) |
+
+
+## Available Checkpoints
+
+The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.
+
+
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. Also, to learn more about reducing the memory usage of this pipeline, refer to the [Reduce memory usage](../../using-diffusers/svd#reduce-memory-usage) section of the Stable Video Diffusion guide.
+
+
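+As a minimal sketch of one such memory-saving technique (assuming a CUDA device and that the Marigold pipelines support model offloading like most diffusers pipelines), submodules can be kept on the CPU and moved to the GPU only while they are needed:
+
+```py
+import diffusers
+import torch
+
+pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+    "prs-eth/marigold-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
+)
+# Offload submodules to the CPU and load each one onto the GPU only for its forward pass.
+pipe.enable_model_cpu_offload()
+```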
+
+
+
+Marigold pipelines were designed and tested only with `DDIMScheduler` and `LCMScheduler`.
+Depending on the scheduler, the number of inference steps required to get reliable predictions varies, and there is no universal value that works best across schedulers.
+Because of that, the default value of `num_inference_steps` in the `__call__` method of the pipeline is set to `None` (see the API reference).
+Unless set explicitly, its value will be taken from the checkpoint configuration `model_index.json`.
+This is done to ensure high-quality predictions when calling the pipeline with just the `image` argument.
+
+
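+For example (a small sketch assuming the `prs-eth/marigold-lcm-v1-0` depth checkpoint), omitting `num_inference_steps` picks up the value stored in the checkpoint's `model_index.json`, while passing it explicitly overrides that default:
+
+```py
+import diffusers
+import torch
+
+pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+    "prs-eth/marigold-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
+).to("cuda")
+image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+
+depth = pipe(image)                         # steps taken from the checkpoint configuration
+depth = pipe(image, num_inference_steps=4)  # explicit override
+```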
+
+See also the Marigold [usage examples](../../using-diffusers/marigold_usage).
+
+## MarigoldDepthPipeline
+[[autodoc]] MarigoldDepthPipeline
+ - all
+ - __call__
+
+## MarigoldNormalsPipeline
+[[autodoc]] MarigoldNormalsPipeline
+ - all
+ - __call__
+
+## MarigoldDepthOutput
+[[autodoc]] pipelines.marigold.pipeline_marigold_depth.MarigoldDepthOutput
+
+## MarigoldNormalsOutput
+[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
\ No newline at end of file
diff --git a/docs/source/en/conceptual/philosophy.md b/docs/source/en/conceptual/philosophy.md
index 29df833f1b8d..c94b8513901a 100644
--- a/docs/source/en/conceptual/philosophy.md
+++ b/docs/source/en/conceptual/philosophy.md
@@ -70,7 +70,7 @@ The following design principles are followed:
- Pipelines should be used **only** for inference.
- Pipelines should be very readable, self-explanatory, and easy to tweak.
- Pipelines should be designed to build on top of each other and be easy to integrate into higher-level APIs.
-- Pipelines are **not** intended to be feature-complete user interfaces. For future complete user interfaces one should rather have a look at [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), and [lama-cleaner](https://github.com/Sanster/lama-cleaner).
+- Pipelines are **not** intended to be feature-complete user interfaces. For feature-complete user interfaces one should rather have a look at [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), and [lama-cleaner](https://github.com/Sanster/lama-cleaner).
- Every pipeline should have one and only one way to run it via a `__call__` method. The naming of the `__call__` arguments should be shared across all pipelines.
- Pipelines should be named after the task they are intended to solve.
- In almost all cases, novel diffusion pipelines shall be implemented in a new pipeline folder/file.
diff --git a/docs/source/en/using-diffusers/image_quality.md b/docs/source/en/using-diffusers/image_quality.md
index 8961f88b904d..c25fa1467edf 100644
--- a/docs/source/en/using-diffusers/image_quality.md
+++ b/docs/source/en/using-diffusers/image_quality.md
@@ -12,54 +12,10 @@ specific language governing permissions and limitations under the License.
# Controlling image quality
-The components of a diffusion model, like the UNet and scheduler, can be optimized to improve the quality of generated images leading to better image lighting and details. These techniques are especially useful if you don't have the resources to simply use a larger model for inference. You can enable these techniques during inference without any additional training.
+The components of a diffusion model, like the UNet and scheduler, can be optimized to improve the quality of generated images leading to better details. These techniques are especially useful if you don't have the resources to simply use a larger model for inference. You can enable these techniques during inference without any additional training.
This guide will show you how to turn these techniques on in your pipeline and how to configure them to improve the quality of your generated images.
-## Lighting
-
-The Stable Diffusion models aren't very good at generating images that are very bright or dark because the scheduler doesn't start sampling from the last timestep and it doesn't enforce a zero signal-to-noise ratio (SNR). The [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://hf.co/papers/2305.08891) paper fixes these issues which are now available in some Diffusers schedulers.
-
-> [!TIP]
-> For inference, you need a model that has been trained with *v_prediction*. To train your own model with *v_prediction*, add the following flag to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts.
->
-> ```bash
-> --prediction_type="v_prediction"
-> ```
-
-For example, load the [ptx0/pseudo-journey-v2](https://hf.co/ptx0/pseudo-journey-v2) checkpoint which was trained with `v_prediction` and the [`DDIMScheduler`]. Now you should configure the following parameters in the [`DDIMScheduler`].
-
-* `rescale_betas_zero_snr=True` to rescale the noise schedule to zero SNR
-* `timestep_spacing="trailing"` to start sampling from the last timestep
-
-Set `guidance_rescale` in the pipeline to prevent over-exposure. A lower value increases brightness but some of the details may appear washed out.
-
-```py
-from diffusers import DiffusionPipeline, DDIMScheduler
-
-pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
-
-pipeline.scheduler = DDIMScheduler.from_config(
- pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
-)
-pipeline.to("cuda")
-prompt = "cinematic photo of a snowy mountain at night with the northern lights aurora borealis overhead, 35mm photograph, film, professional, 4k, highly detailed"
-generator = torch.Generator(device="cpu").manual_seed(23)
-image = pipeline(prompt, guidance_rescale=0.7, generator=generator).images[0]
-image
-```
-
-
-
-
-
-default Stable Diffusion v2-1 image
-
-image with zero SNR and trailing timestep spacing enabled
-
-
-
## Details
[FreeU](https://hf.co/papers/2309.11497) improves image details by rebalancing the UNet's backbone and skip connection weights. The skip connections can cause the model to overlook some of the backbone semantics which may lead to unnatural image details in the generated image. This technique does not require any additional training and can be applied on the fly during inference for tasks like image-to-image and text-to-video.
diff --git a/docs/source/en/using-diffusers/marigold_usage.md b/docs/source/en/using-diffusers/marigold_usage.md
new file mode 100644
index 000000000000..ebfa4eb8c444
--- /dev/null
+++ b/docs/source/en/using-diffusers/marigold_usage.md
@@ -0,0 +1,466 @@
+
+
+# Marigold Pipelines for Computer Vision Tasks
+
+[Marigold](../api/pipelines/marigold) is a novel diffusion-based dense prediction approach and a set of pipelines for various computer vision tasks, such as monocular depth estimation.
+
+This guide will show you how to use Marigold to obtain fast and high-quality predictions for images and videos.
+
+Each pipeline supports one Computer Vision task, taking an RGB image as input and producing a *prediction* of the modality of interest, such as a depth map of the input image.
+Currently, the following tasks are implemented:
+
+| Pipeline | Predicted Modalities | Demos |
+|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
+| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
+| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm) |
+
+The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.
+These checkpoints are meant to work with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold).
+The original code can also be used to train new checkpoints.
+
+| Checkpoint | Modality | Comment |
+|-----------------------------------------------------------------------------------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [prs-eth/marigold-v1-0](https://huggingface.co/prs-eth/marigold-v1-0) | Depth | The first Marigold Depth checkpoint, which predicts *affine-invariant depth* maps. The performance of this checkpoint in benchmarks was studied in the original [paper](https://huggingface.co/papers/2312.02145). Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. Affine-invariant depth prediction has a range of values in each pixel between 0 (near plane) and 1 (far plane); both planes are chosen by the model as part of the inference process. See the `MarigoldImageProcessor` reference for visualization utilities. |
+| [prs-eth/marigold-lcm-v1-0](https://huggingface.co/prs-eth/marigold-lcm-v1-0) | Depth | The fast Marigold Depth checkpoint, fine-tuned from `prs-eth/marigold-v1-0`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that. |
+| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1) | Normals | A preview checkpoint for the Marigold Normals pipeline. Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. The surface normals predictions are unit-length 3D vectors with values in the range from -1 to 1. *This checkpoint will be phased out after the release of `v1-0` version.* |
+| [prs-eth/marigold-normals-lcm-v0-1](https://huggingface.co/prs-eth/marigold-normals-lcm-v0-1) | Normals | The fast Marigold Normals checkpoint, fine-tuned from `prs-eth/marigold-normals-v0-1`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that. *This checkpoint will be phased out after the release of `v1-0` version.* |
+
+The examples below are mostly given for depth prediction, but they can be universally applied to the other supported modalities.
+We showcase the predictions using the same input image of Albert Einstein generated by Midjourney.
+This makes it easier to compare visualizations of the predictions across various modalities and checkpoints.
+
+
+
+
+
+*Example input image for all Marigold pipelines*
+
+
+
+
+### Depth Prediction Quick Start
+
+To get the first depth prediction, load the `prs-eth/marigold-depth-lcm-v1-0` checkpoint into the `MarigoldDepthPipeline`, put the image through the pipeline, and save the predictions:
+
+```python
+import diffusers
+import torch
+
+pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+ "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
+).to("cuda")
+
+image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+depth = pipe(image)
+
+vis = pipe.image_processor.visualize_depth(depth.prediction)
+vis[0].save("einstein_depth.png")
+
+depth_16bit = pipe.image_processor.export_depth_to_16bit_png(depth.prediction)
+depth_16bit[0].save("einstein_depth_16bit.png")
+```
+
+The visualization function for depth [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth`] applies one of [matplotlib's colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html) (`Spectral` by default) to map the predicted pixel values from a single-channel `[0, 1]` depth range into an RGB image.
+With the `Spectral` colormap, near pixels are painted red and far pixels are painted blue.
+The 16-bit PNG file stores the single channel values mapped linearly from the `[0, 1]` range into `[0, 65535]`.
+Below are the raw and the visualized predictions; as can be seen, dark areas (mustache) are easier to distinguish in the visualization:
+
+
+
+
+
+*Predicted depth (16-bit PNG)*
+
+*Predicted depth visualization (Spectral)*
+
+
+
+
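+To work with the raw values numerically, the 16-bit PNG can be mapped back into the `[0, 1]` range; a small sketch assuming NumPy and Pillow:
+
+```python
+import numpy as np
+from PIL import Image
+
+# Undo the linear mapping described above: [0, 65535] -> [0, 1]
+depth_16bit = np.asarray(Image.open("einstein_depth_16bit.png"))
+depth = depth_16bit.astype(np.float32) / 65535.0
+```
+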
+### Surface Normals Prediction Quick Start
+
+Load the `prs-eth/marigold-normals-lcm-v0-1` checkpoint into the `MarigoldNormalsPipeline`, put the image through the pipeline, and save the predictions:
+
+```python
+import diffusers
+import torch
+
+pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
+ "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
+).to("cuda")
+
+image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+normals = pipe(image)
+
+vis = pipe.image_processor.visualize_normals(normals.prediction)
+vis[0].save("einstein_normals.png")
+```
+
+The visualization function for normals [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals`] maps the three-dimensional prediction with pixel values in the range `[-1, 1]` into an RGB image.
+The visualization function supports flipping surface normals axes to make the visualization compatible with other choices of the frame of reference.
+Conceptually, each pixel is painted according to the surface normal vector in the frame of reference, where `X` axis points right, `Y` axis points up, and `Z` axis points at the viewer.
+Below is the visualized prediction:
+
+
+
+
+
+*Predicted surface normals visualization*
+
+
+
+
+In this example, the nose tip almost certainly has a point on the surface where the surface normal vector points straight at the viewer, meaning that its coordinates are `[0, 0, 1]`.
+This vector maps to the RGB value `[128, 128, 255]`, which corresponds to a violet-blue color.
+Similarly, a surface normal on the cheek in the right part of the image has a large `X` component, which increases the red hue.
+Points on the shoulders pointing up, with a large `Y` component, promote the green hue.
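+
+Conceptually, the visualization rescales each unit-length normal from `[-1, 1]` into the `[0, 255]` RGB cube; a one-line sketch of that mapping, assuming NumPy:
+
+```python
+import numpy as np
+
+normal = np.array([0.0, 0.0, 1.0])            # surface normal pointing straight at the viewer
+rgb = np.round((normal + 1.0) / 2.0 * 255.0)  # -> [128., 128., 255.]
+```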
+
+### Speeding up inference
+
+The above quick start snippets are already optimized for speed: they load the LCM checkpoint, use the `fp16` variant of weights and computation, and perform just one denoising diffusion step.
+The `pipe(image)` call completes in 280ms on an RTX 3090 GPU.
+Internally, the input image is encoded with the Stable Diffusion VAE encoder, then the U-Net performs one denoising step, and finally, the prediction latent is decoded with the VAE decoder into pixel space.
+In this case, two out of three module calls are dedicated to converting between the pixel and latent spaces of the LDM.
+Because Marigold's latent space is compatible with the base Stable Diffusion, it is possible to speed up the pipeline call by more than 3x (85ms on an RTX 3090) by using a [lightweight replacement of the SD VAE](../api/models/autoencoder_tiny):
+
+```diff
+ import diffusers
+ import torch
+
+ pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+ "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
+ ).to("cuda")
+
++ pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
++ "madebyollin/taesd", torch_dtype=torch.float16
++ ).cuda()
+
+ image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+ depth = pipe(image)
+```
+
+As suggested in [Optimizations](../optimization/torch2.0), adding `torch.compile` may squeeze extra performance depending on the target hardware:
+
+```diff
+ import diffusers
+ import torch
+
+ pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+ "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
+ ).to("cuda")
+
++ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+ depth = pipe(image)
+```
+
+## Qualitative Comparison with Depth Anything
+
+With the above speed optimizations, Marigold delivers predictions that are both more detailed and faster than [Depth Anything](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything) with its largest checkpoint, [LiheYoung/depth-anything-large-hf](https://huggingface.co/LiheYoung/depth-anything-large-hf):
+
+
+
+
+
+*Marigold LCM fp16 with Tiny AutoEncoder*
+
+*Depth Anything Large*
+
+
+
+
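+For reference, the Depth Anything prediction shown above can be reproduced with the `transformers` depth-estimation pipeline (a sketch assuming `transformers` is installed):
+
+```python
+from transformers import pipeline
+from diffusers.utils import load_image
+
+image = load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+depth_anything = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-large-hf")
+depth_anything(image)["depth"].save("einstein_depth_anything.png")
+```
+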
+## Maximizing Precision and Ensembling
+
+Marigold pipelines have a built-in ensembling mechanism combining multiple predictions from different random latents.
+This is a brute-force way of improving the precision of predictions, capitalizing on the generative nature of diffusion.
+The ensembling path is activated automatically when the `ensemble_size` argument is set greater than `1`.
+When aiming for maximum precision, it makes sense to adjust `num_inference_steps` simultaneously with `ensemble_size`.
+The recommended values vary across checkpoints but primarily depend on the scheduler type.
+The effect of ensembling is particularly visible with surface normals:
+
+```python
+import diffusers
+
+model_path = "prs-eth/marigold-normals-v1-0"
+
+model_paper_kwargs = {
+ diffusers.schedulers.DDIMScheduler: {
+ "num_inference_steps": 10,
+ "ensemble_size": 10,
+ },
+ diffusers.schedulers.LCMScheduler: {
+ "num_inference_steps": 4,
+ "ensemble_size": 5,
+ },
+}
+
+image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+
+pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(model_path).to("cuda")
+pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
+
+normals = pipe(image, **pipe_kwargs)
+
+vis = pipe.image_processor.visualize_normals(normals.prediction)
+vis[0].save("einstein_normals.png")
+```
+
+
+
+
+
+*Surface normals, no ensembling*
+
+*Surface normals, with ensembling*
+
+
+
+
+As can be seen, all areas with fine-grained structures, such as hair, receive more conservative and, on average, more accurate predictions.
+Such a result is more suitable for precision-sensitive downstream tasks, such as 3D reconstruction.
+
+## Quantitative Evaluation
+
+To evaluate Marigold quantitatively in standard leaderboards and benchmarks (such as NYU, KITTI, and other datasets), follow the evaluation protocol outlined in the paper: load the full precision fp32 model and use appropriate values for `num_inference_steps` and `ensemble_size`.
+Optionally seed randomness to ensure reproducibility. Maximizing `batch_size` will deliver maximum device utilization.
+
+```python
+import diffusers
+import torch
+
+device = "cuda"
+seed = 2024
+model_path = "prs-eth/marigold-v1-0"
+
+model_paper_kwargs = {
+ diffusers.schedulers.DDIMScheduler: {
+ "num_inference_steps": 50,
+ "ensemble_size": 10,
+ },
+ diffusers.schedulers.LCMScheduler: {
+ "num_inference_steps": 4,
+ "ensemble_size": 10,
+ },
+}
+
+image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+
+generator = torch.Generator(device=device).manual_seed(seed)
+pipe = diffusers.MarigoldDepthPipeline.from_pretrained(model_path).to(device)
+pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
+
+depth = pipe(image, generator=generator, **pipe_kwargs)
+
+# evaluate metrics
+```
+
+## Using Predictive Uncertainty
+
+The ensembling mechanism built into Marigold pipelines combines multiple predictions obtained from different random latents.
+As a side effect, it can be used to quantify epistemic (model) uncertainty; simply specify `ensemble_size` greater than 1 and set `output_uncertainty=True`.
+The resulting uncertainty will be available in the `uncertainty` field of the output.
+It can be visualized as follows:
+
+```python
+import diffusers
+import torch
+
+pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+ "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
+).to("cuda")
+
+image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+depth = pipe(
+ image,
+ ensemble_size=10, # any number greater than 1; higher values yield higher precision
+ output_uncertainty=True,
+)
+
+uncertainty = pipe.image_processor.visualize_uncertainty(depth.uncertainty)
+uncertainty[0].save("einstein_depth_uncertainty.png")
+```
+
+
+
+
+
+*Depth uncertainty*
+
+*Surface normals uncertainty*
+
+
+
+
+The interpretation of uncertainty is easy: higher values (white) correspond to pixels where the model struggles to make consistent predictions.
+Evidently, the depth model is the least confident around edges with discontinuity, where the object depth changes drastically.
+The surface normals model is the least confident in fine-grained structures, such as hair, and dark areas, such as the collar.
+
+## Frame-by-frame Video Processing with Temporal Consistency
+
+Due to Marigold's generative nature, each prediction is unique and defined by the random noise sampled for the latent initialization.
+This becomes an obvious drawback compared to traditional end-to-end dense regression networks, as exemplified in the following videos:
+
+
+
+
+
+*Input video*
+
+*Marigold Depth applied to input video frames independently*
+
+
+
+To address this issue, it is possible to pass the `latents` argument to the pipelines, which defines the starting point of diffusion.
+Empirically, we found that a convex combination of the very same starting-point noise latent and the latent corresponding to the previous frame prediction gives sufficiently smooth results, as implemented in the snippet below:
+
+```python
+import imageio
+from PIL import Image
+from tqdm import tqdm
+import diffusers
+import torch
+
+device = "cuda"
+path_in = "obama.mp4"
+path_out = "obama_depth.gif"
+
+pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+ "prs-eth/marigold-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
+).to(device)
+pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
+ "madebyollin/taesd", torch_dtype=torch.float16
+).to(device)
+pipe.set_progress_bar_config(disable=True)
+
+with imageio.get_reader(path_in) as reader:
+ size = reader.get_meta_data()['size']
+ last_frame_latent = None
+ latent_common = torch.randn(
+ (1, 4, 768 * size[1] // (8 * max(size)), 768 * size[0] // (8 * max(size)))
+ ).to(device=device, dtype=torch.float16)
+
+ out = []
+ for frame_id, frame in tqdm(enumerate(reader), desc="Processing Video"):
+ frame = Image.fromarray(frame)
+ latents = latent_common
+ if last_frame_latent is not None:
+ latents = 0.9 * latents + 0.1 * last_frame_latent
+
+ depth = pipe(
+ frame, match_input_resolution=False, latents=latents, output_latent=True
+ )
+ last_frame_latent = depth.latent
+ out.append(pipe.image_processor.visualize_depth(depth.prediction)[0])
+
+ diffusers.utils.export_to_gif(out, path_out, fps=reader.get_meta_data()['fps'])
+```
+
+Here, the diffusion process starts from the given computed latent.
+The call sets `output_latent=True` to expose the prediction latent as `depth.latent`, which is then blended into the next frame's latent initialization.
+The result is much more stable now:
+
+
+
+
+
+*Marigold Depth applied to input video frames independently*
+
+*Marigold Depth with forced latents initialization*
+
+
+
+## Marigold for ControlNet
+
+A very common application for depth prediction with diffusion models comes in conjunction with ControlNet.
+Depth crispness plays a crucial role in obtaining high-quality results from ControlNet.
+As seen in comparisons with other methods above, Marigold excels at that task.
+The snippet below demonstrates how to load an image, compute depth, and pass it into ControlNet in a compatible format:
+
+```python
+import torch
+import diffusers
+
+device = "cuda"
+generator = torch.Generator(device=device).manual_seed(2024)
+image = diffusers.utils.load_image(
+ "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_depth_source.png"
+)
+
+pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+ "prs-eth/marigold-lcm-v1-0", torch_dtype=torch.float16, variant="fp16"
+).to("cuda")
+
+depth_image = pipe(image, generator=generator).prediction
+depth_image = pipe.image_processor.visualize_depth(depth_image, color_map="binary")
+depth_image[0].save("motorcycle_controlnet_depth.png")
+
+controlnet = diffusers.ControlNetModel.from_pretrained(
+ "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
+).to("cuda")
+pipe = diffusers.StableDiffusionXLControlNetPipeline.from_pretrained(
+ "SG161222/RealVisXL_V4.0", torch_dtype=torch.float16, variant="fp16", controlnet=controlnet
+).to("cuda")
+pipe.scheduler = diffusers.DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)
+
+controlnet_out = pipe(
+ prompt="high quality photo of a sports bike, city",
+ negative_prompt="",
+ guidance_scale=6.5,
+ num_inference_steps=25,
+ image=depth_image,
+ controlnet_conditioning_scale=0.7,
+ control_guidance_end=0.7,
+ generator=generator,
+).images
+controlnet_out[0].save("motorcycle_controlnet_out.png")
+```
+
+
+
+
+
+*Input image*
+
+*Depth in the format compatible with ControlNet*
+
+*ControlNet generation, conditioned on depth and prompt: "high quality photo of a sports bike, city"*
+
+
+
+
+Hopefully, you will find Marigold useful for solving your downstream tasks, be it as part of a broader generative workflow or a perception task such as 3D reconstruction.
diff --git a/docs/source/en/using-diffusers/scheduler_features.md b/docs/source/en/using-diffusers/scheduler_features.md
new file mode 100644
index 000000000000..445acdccc489
--- /dev/null
+++ b/docs/source/en/using-diffusers/scheduler_features.md
@@ -0,0 +1,235 @@
+
+
+# Scheduler features
+
+The scheduler is an important component of any diffusion model because it controls the entire denoising (or sampling) process. There are many types of schedulers, some optimized for speed and others for quality. With Diffusers, you can modify the scheduler configuration to use custom timestep schedules and sigmas, and to rescale the noise schedule. Changing these parameters can have profound effects on inference quality and speed.
+
+This guide will demonstrate how to use these features to improve inference quality.
+
+> [!TIP]
+> Diffusers currently only supports the `timesteps` and `sigmas` parameters for a select list of schedulers and pipelines. Feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to extend these parameters to a scheduler and pipeline that does not currently support it!
+
+## Timestep schedules
+
+The timestep or noise schedule determines the amount of noise at each sampling step. The scheduler uses this to generate an image with the corresponding amount of noise at each step. The timestep schedule is generated from the scheduler's default configuration, but you can customize the scheduler to use new and optimized sampling schedules that aren't in Diffusers yet.
+
+For example, [Align Your Steps (AYS)](https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/) is a method for optimizing a sampling schedule to generate a high-quality image in as little as 10 steps. The optimal [10-step schedule](https://github.com/huggingface/diffusers/blob/a7bf77fc284810483f1e60afe34d1d27ad91ce2e/src/diffusers/schedulers/scheduling_utils.py#L51) for Stable Diffusion XL is:
+
+```py
+from diffusers.schedulers import AysSchedules
+
+sampling_schedule = AysSchedules["StableDiffusionXLTimesteps"]
+print(sampling_schedule)
+"[999, 845, 730, 587, 443, 310, 193, 116, 53, 13]"
+```
+
+You can use the AYS sampling schedule in a pipeline by passing it to the `timesteps` parameter.
+
+```py
+import torch
+from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler
+
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+ "SG161222/RealVisXL_V4.0",
+ torch_dtype=torch.float16,
+ variant="fp16",
+).to("cuda")
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, algorithm_type="sde-dpmsolver++")
+
+prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
+generator = torch.Generator(device="cpu").manual_seed(2487854446)
+image = pipeline(
+ prompt=prompt,
+ negative_prompt="",
+ generator=generator,
+ timesteps=sampling_schedule,
+).images[0]
+```
+
+
+
+
+
+*AYS timestep schedule 10 steps*
+
+*Linearly-spaced timestep schedule 10 steps*
+
+*Linearly-spaced timestep schedule 25 steps*
+
+
+
+## Timestep spacing
+
+The way sample steps are selected in the schedule can affect the quality of the generated image, especially with respect to [rescaling the noise schedule](#rescale-noise-schedule), which can enable a model to generate much brighter or darker images. Diffusers provides three timestep spacing methods:
+
+- `leading` creates evenly spaced steps
+- `linspace` includes the first and last steps and evenly selects the remaining intermediate steps
+- `trailing` only includes the last step and evenly selects the remaining intermediate steps starting from the end
+
+It is recommended to use the `trailing` spacing method because it generates higher quality images with more details when there are fewer sample steps. But the difference in quality is not as obvious for more standard sample step values.
+
+```py
+import torch
+from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler
+
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+ "SG161222/RealVisXL_V4.0",
+ torch_dtype=torch.float16,
+ variant="fp16",
+).to("cuda")
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, timestep_spacing="trailing")
+
+prompt = "A cinematic shot of a cute little black cat sitting on a pumpkin at night"
+generator = torch.Generator(device="cpu").manual_seed(2487854446)
+image = pipeline(
+ prompt=prompt,
+ negative_prompt="",
+ generator=generator,
+ num_inference_steps=5,
+).images[0]
+image
+```
+
+
+
+
+
+*trailing spacing after 5 steps*
+
+*leading spacing after 5 steps*
+
+
+
+## Sigmas
+
+The `sigmas` parameter is the amount of noise added at each timestep according to the timestep schedule. Like the `timesteps` parameter, you can customize the `sigmas` parameter to control how much noise is added at each step. When you use a custom `sigmas` value, the `timesteps` are calculated from the custom `sigmas` value and the default scheduler configuration is ignored.
+
+For example, you can manually pass the [sigmas](https://github.com/huggingface/diffusers/blob/6529ee67ec02fcf58d2fd9242164ea002b351d75/src/diffusers/schedulers/scheduling_utils.py#L55) for something like the 10-step AYS schedule from before to the pipeline.
+
+```py
+import torch
+
+from diffusers import DiffusionPipeline, EulerDiscreteScheduler
+
+model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+pipeline = DiffusionPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-xl-base-1.0",
+ torch_dtype=torch.float16,
+ variant="fp16",
+).to("cuda")
+pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0]
+prompt = "anthropomorphic capybara wearing a suit and working with a computer"
+generator = torch.Generator(device='cuda').manual_seed(123)
+image = pipeline(
+ prompt=prompt,
+ num_inference_steps=10,
+ sigmas=sigmas,
+ generator=generator
+).images[0]
+```
+
+When you take a look at the scheduler's `timesteps` parameter, you'll see that it is the same as the AYS timestep schedule because the `timestep` schedule is calculated from the `sigmas`.
+
+```py
+print(f" timesteps: {pipe.scheduler.timesteps}")
+"timesteps: tensor([999., 845., 730., 587., 443., 310., 193., 116., 53., 13.], device='cuda:0')"
+```
+
+### Karras sigmas
+
+> [!TIP]
+> Refer to the scheduler API [overview](../api/schedulers/overview) for a list of schedulers that support Karras sigmas.
+>
+> Karras sigmas should not be used for models that weren't trained with them. For example, the base Stable Diffusion XL model shouldn't use Karras sigmas, but the [DreamShaperXL](https://hf.co/Lykon/dreamshaper-xl-1-0) model can since it was trained with them.
+
+Karras schedulers use the timestep schedule and sigmas from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://hf.co/papers/2206.00364) paper. Compared to other schedulers, this scheduler variant applies a smaller amount of noise per step as it approaches the end of the sampling process, which can increase the level of detail in the generated image.
+
+Enable Karras sigmas by setting `use_karras_sigmas=True` in the scheduler.
+
+```py
+import torch
+from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler
+
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+ "SG161222/RealVisXL_V4.0",
+ torch_dtype=torch.float16,
+ variant="fp16",
+).to("cuda")
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True)
+
+prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
+generator = torch.Generator(device="cpu").manual_seed(2487854446)
+image = pipeline(
+ prompt=prompt,
+ negative_prompt="",
+ generator=generator,
+).images[0]
+```
+
+*Karras sigmas enabled*
+
+*Karras sigmas disabled*
+
+
+
+## Rescale noise schedule
+
+In the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://hf.co/papers/2305.08891) paper, the authors discovered that common noise schedules allowed some signal to leak into the last timestep. This signal leakage at inference can cause models to only generate images with medium brightness. By enforcing a zero signal-to-noise ratio (SNR) for the timestep schedule and sampling from the last timestep, the model can be improved to generate very bright or dark images.
+
+> [!TIP]
+> For inference, you need a model that has been trained with *v_prediction*. To train your own model with *v_prediction*, add the following flag to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts.
+>
+> ```bash
+> --prediction_type="v_prediction"
+> ```
+
+For example, load the [ptx0/pseudo-journey-v2](https://hf.co/ptx0/pseudo-journey-v2) checkpoint which was trained with `v_prediction` and the [`DDIMScheduler`]. Configure the following parameters in the [`DDIMScheduler`]:
+
+* `rescale_betas_zero_snr=True` to rescale the noise schedule to zero SNR
+* `timestep_spacing="trailing"` to start sampling from the last timestep
+
+Set `guidance_rescale` in the pipeline to prevent over-exposure. A lower value increases brightness but some of the details may appear washed out.
+
+```py
+import torch
+from diffusers import DiffusionPipeline, DDIMScheduler
+
+pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
+
+pipeline.scheduler = DDIMScheduler.from_config(
+ pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
+)
+pipeline.to("cuda")
+prompt = "cinematic photo of a snowy mountain at night with the northern lights aurora borealis overhead, 35mm photograph, film, professional, 4k, highly detailed"
+generator = torch.Generator(device="cpu").manual_seed(23)
+image = pipeline(prompt, guidance_rescale=0.7, generator=generator).images[0]
+image
+```
+
+
+
+
+
+*default Stable Diffusion v2-1 image*
+
+*image with zero SNR and trailing timestep spacing enabled*
+
+
diff --git a/docs/source/en/using-diffusers/schedulers.md b/docs/source/en/using-diffusers/schedulers.md
index bfc8aa1a2108..01dab2bed7fe 100644
--- a/docs/source/en/using-diffusers/schedulers.md
+++ b/docs/source/en/using-diffusers/schedulers.md
@@ -212,62 +212,6 @@ images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
```
-## Custom Timestep Schedules
-
-With all our schedulers, you can choose one of the popular timestep schedules using configurations such as `timestep_spacing`, `interpolation_type`, and `use_karras_sigmas`. Some schedulers also provide the flexibility to use a custom timestep schedule. You can use any list of arbitrary timesteps, we will use the AYS timestep schedule here as example. It is a set of 10-step optimized timestep schedules released by researchers from Nvidia that can achieve significantly better quality compared to the preset timestep schedules. You can read more about their research [here](https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/).
-
-```python
-from diffusers.schedulers import AysSchedules
-sampling_schedule = AysSchedules["StableDiffusionXLTimesteps"]
-print(sampling_schedule)
-```
-```
-[999, 845, 730, 587, 443, 310, 193, 116, 53, 13]
-```
-
-You can then create a pipeline and pass this custom timestep schedule to it as `timesteps`.
-
-```python
-pipe = StableDiffusionXLPipeline.from_pretrained(
- "SG161222/RealVisXL_V4.0",
- torch_dtype=torch.float16,
- variant="fp16",
-).to("cuda")
-
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, algorithm_type="sde-dpmsolver++")
-
-prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
-
-generator = torch.Generator(device="cpu").manual_seed(2487854446)
-
-image = pipe(
- prompt=prompt,
- negative_prompt="",
- generator=generator,
- timesteps=sampling_schedule,
-).images[0]
-```
-The generated image has better quality than the default linear timestep schedule for the same number of steps, and it is similar to the default timestep scheduler when running for 25 steps.
-
-
-
-
-
-AYS timestep schedule 10 steps
-
-Linearly-spaced timestep schedule 10 steps
-
-Linearly-spaced timestep schedule 25 steps
-
-
-
-> [!TIP]
-> 🤗 Diffusers currently only supports `timesteps` and `sigmas` for a selected list of schedulers and pipelines, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to extend feature to a scheduler and pipeline that does not currently support it!
-
-
## Models
Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.
diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py
index 143f67f9cb81..5ed00e14c14f 100644
--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py
@@ -71,7 +71,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
index 8b5b00f3bd88..32e882acb56e 100644
--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
@@ -78,7 +78,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/community/kohya_hires_fix.py b/examples/community/kohya_hires_fix.py
new file mode 100644
index 000000000000..867d636c7cae
--- /dev/null
+++ b/examples/community/kohya_hires_fix.py
@@ -0,0 +1,468 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from diffusers.configuration_utils import register_to_config
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models.autoencoders import AutoencoderKL
+from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel, UNet2DConditionOutput
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+class UNet2DConditionModelHighResFix(UNet2DConditionModel):
+    r"""
+    A conditional 2D UNet model that applies the Kohya fix proposed for high-resolution image generation.
+
+    This model inherits from [`UNet2DConditionModel`]. Check the superclass documentation to learn about all the parameters.
+
+    Parameters:
+        high_res_fix (`List[Dict]`, *optional*, defaults to `[{'timestep': 600, 'scale_factor': 0.5, 'block_num': 1}]`):
+            Enables the Kohya fix for high-resolution generation. The activation maps produced by the down block at
+            `block_num` are scaled by `scale_factor` for all timesteps greater than `timestep`.
+    """
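+    # The entries in `high_res_fix` are matched against the current timestep from most to least noisy,
+    # and the first matching entry wins. A hypothetical two-stage configuration could look like:
+    #   high_res_fix=[
+    #       {"timestep": 700, "scale_factor": 0.5, "block_num": 1},   # while t > 700, halve the block-1 activations
+    #       {"timestep": 400, "scale_factor": 0.75, "block_num": 1},  # while 400 < t <= 700, use a milder 0.75 scale
+    #   ]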
+
+ _supports_gradient_checkpointing = True
+
+ @register_to_config
+ def __init__(self, high_res_fix: List[Dict] = [{"timestep": 600, "scale_factor": 0.5, "block_num": 1}], **kwargs):
+ super().__init__(**kwargs)
+ if high_res_fix:
+ self.config.high_res_fix = sorted(high_res_fix, key=lambda x: x["timestep"], reverse=True)
+
+ @classmethod
+ def _resize(cls, sample, target=None, scale_factor=1, mode="bicubic"):
+ dtype = sample.dtype
+ if dtype == torch.bfloat16:
+ sample = sample.to(torch.float32)
+
+ if target is not None:
+ if sample.shape[-2:] != target.shape[-2:]:
+ sample = nn.functional.interpolate(sample, size=target.shape[-2:], mode=mode, align_corners=False)
+ elif scale_factor != 1:
+ sample = nn.functional.interpolate(sample, scale_factor=scale_factor, mode=mode, align_corners=False)
+
+ return sample.to(dtype)
+
+ def forward(
+ self,
+ sample: torch.FloatTensor,
+ timestep: Union[torch.Tensor, float, int],
+ encoder_hidden_states: torch.Tensor,
+ class_labels: Optional[torch.Tensor] = None,
+ timestep_cond: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
+ down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ return_dict: bool = True,
+ ) -> Union[UNet2DConditionOutput, Tuple]:
+ r"""
+ The [`UNet2DConditionModel`] forward method.
+
+ Args:
+ sample (`torch.FloatTensor`):
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+ encoder_hidden_states (`torch.FloatTensor`):
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+ class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+ timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+ Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+ through the `self.time_embedding` layer to obtain the timestep embeddings.
+ attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+ negative values to the attention scores corresponding to "discard" tokens.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ added_cond_kwargs: (`dict`, *optional*):
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+ are passed along to the UNet blocks.
+ down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+ A tuple of tensors that if specified are added to the residuals of down unet blocks.
+ mid_block_additional_residual: (`torch.Tensor`, *optional*):
+ A tensor that if specified is added to the residual of the middle unet block.
+ down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+ additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
+ encoder_attention_mask (`torch.Tensor`):
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+ tuple.
+
+ Returns:
+ [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
+ otherwise a `tuple` is returned where the first element is the sample tensor.
+ """
+        # By default samples have to be at least a multiple of the overall upsampling factor.
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
+ # on the fly if necessary.
+ default_overall_up_factor = 2**self.num_upsamplers
+
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+ forward_upsample_size = False
+ upsample_size = None
+
+ for dim in sample.shape[-2:]:
+ if dim % default_overall_up_factor != 0:
+ # Forward upsample size to force interpolation output size.
+ forward_upsample_size = True
+ break
+
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+ # expects mask of shape:
+ # [batch, key_tokens]
+ # adds singleton query_tokens dimension:
+ # [batch, 1, key_tokens]
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+ if attention_mask is not None:
+ # assume that mask is expressed as:
+ # (1 = keep, 0 = discard)
+ # convert mask into a bias that can be added to attention scores:
+ # (keep = +0, discard = -10000.0)
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+ attention_mask = attention_mask.unsqueeze(1)
+
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
+ if encoder_attention_mask is not None:
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+ # 0. center input if necessary
+ if self.config.center_input_sample:
+ sample = 2 * sample - 1.0
+
+ # 1. time
+ t_emb = self.get_time_embed(sample=sample, timestep=timestep)
+ emb = self.time_embedding(t_emb, timestep_cond)
+ aug_emb = None
+
+ class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
+ if class_emb is not None:
+ if self.config.class_embeddings_concat:
+ emb = torch.cat([emb, class_emb], dim=-1)
+ else:
+ emb = emb + class_emb
+
+ aug_emb = self.get_aug_embed(
+ emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+ )
+ if self.config.addition_embed_type == "image_hint":
+ aug_emb, hint = aug_emb
+ sample = torch.cat([sample, hint], dim=1)
+
+ emb = emb + aug_emb if aug_emb is not None else emb
+
+ if self.time_embed_act is not None:
+ emb = self.time_embed_act(emb)
+
+ encoder_hidden_states = self.process_encoder_hidden_states(
+ encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+ )
+
+ # 2. pre-process
+ sample = self.conv_in(sample)
+
+ # 2.5 GLIGEN position net
+ if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+ cross_attention_kwargs = cross_attention_kwargs.copy()
+ gligen_args = cross_attention_kwargs.pop("gligen")
+ cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+
+ # 3. down
+ # we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
+ # to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
+ if cross_attention_kwargs is not None:
+ cross_attention_kwargs = cross_attention_kwargs.copy()
+ lora_scale = cross_attention_kwargs.pop("scale", 1.0)
+ else:
+ lora_scale = 1.0
+
+ if USE_PEFT_BACKEND:
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
+ scale_lora_layers(self, lora_scale)
+
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+ # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+ is_adapter = down_intrablock_additional_residuals is not None
+ # maintain backward compatibility for legacy usage, where
+ # T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+ # but can only use one or the other
+ if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+ deprecate(
+ "T2I should not use down_block_additional_residuals",
+ "1.3.0",
+ "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+ and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
+                for ControlNet. Please make sure to use `down_intrablock_additional_residuals` instead. ",
+ standard_warn=False,
+ )
+ down_intrablock_additional_residuals = down_block_additional_residuals
+ is_adapter = True
+
+ down_block_res_samples = (sample,)
+ for down_i, downsample_block in enumerate(self.down_blocks):
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+ # For t2i-adapter CrossAttnDownBlock2D
+ additional_residuals = {}
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
+ additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+
+ sample, res_samples = downsample_block(
+ hidden_states=sample,
+ temb=emb,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ cross_attention_kwargs=cross_attention_kwargs,
+ encoder_attention_mask=encoder_attention_mask,
+ **additional_residuals,
+ )
+
+ else:
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
+ sample += down_intrablock_additional_residuals.pop(0)
+
+ down_block_res_samples += res_samples
+
+ # kohya high res fix
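+            # While the current timestep is above the configured threshold, downscale the running
+            # activations right after the configured down block; sample and skip connections are
+            # brought back to a common resolution in the up path below.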
+ if self.config.high_res_fix:
+ for high_res_fix in self.config.high_res_fix:
+ if timestep > high_res_fix["timestep"] and down_i == high_res_fix["block_num"]:
+ sample = self.__class__._resize(sample, scale_factor=high_res_fix["scale_factor"])
+ break
+
+ if is_controlnet:
+ new_down_block_res_samples = ()
+
+ for down_block_res_sample, down_block_additional_residual in zip(
+ down_block_res_samples, down_block_additional_residuals
+ ):
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+
+ down_block_res_samples = new_down_block_res_samples
+
+ # 4. mid
+ if self.mid_block is not None:
+ if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+ sample = self.mid_block(
+ sample,
+ emb,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ cross_attention_kwargs=cross_attention_kwargs,
+ encoder_attention_mask=encoder_attention_mask,
+ )
+ else:
+ sample = self.mid_block(sample, emb)
+
+ # To support T2I-Adapter-XL
+ if (
+ is_adapter
+ and len(down_intrablock_additional_residuals) > 0
+ and sample.shape == down_intrablock_additional_residuals[0].shape
+ ):
+ sample += down_intrablock_additional_residuals.pop(0)
+
+ if is_controlnet:
+ sample = sample + mid_block_additional_residual
+
+ # 5. up
+ for i, upsample_block in enumerate(self.up_blocks):
+ is_final_block = i == len(self.up_blocks) - 1
+
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+ # up scaling of kohya high res fix
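+            # If the sample was downscaled by the fix, resize it and every skip connection to the
+            # spatial size of the first residual so the upsample block receives consistent shapes.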
+ if self.config.high_res_fix is not None:
+ if res_samples[0].shape[-2:] != sample.shape[-2:]:
+ sample = self.__class__._resize(sample, target=res_samples[0])
+ res_samples_up_sampled = (res_samples[0],)
+ for res_sample in res_samples[1:]:
+ res_samples_up_sampled += (self.__class__._resize(res_sample, target=res_samples[0]),)
+ res_samples = res_samples_up_sampled
+
+ # if we have not reached the final block and need to forward the
+ # upsample size, we do it here
+ if not is_final_block and forward_upsample_size:
+ upsample_size = down_block_res_samples[-1].shape[2:]
+
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+ sample = upsample_block(
+ hidden_states=sample,
+ temb=emb,
+ res_hidden_states_tuple=res_samples,
+ encoder_hidden_states=encoder_hidden_states,
+ cross_attention_kwargs=cross_attention_kwargs,
+ upsample_size=upsample_size,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
+ )
+ else:
+ sample = upsample_block(
+ hidden_states=sample,
+ temb=emb,
+ res_hidden_states_tuple=res_samples,
+ upsample_size=upsample_size,
+ )
+
+ # 6. post-process
+ if self.conv_norm_out:
+ sample = self.conv_norm_out(sample)
+ sample = self.conv_act(sample)
+ sample = self.conv_out(sample)
+
+ if USE_PEFT_BACKEND:
+ # remove `lora_scale` from each PEFT layer
+ unscale_lora_layers(self, lora_scale)
+
+ if not return_dict:
+ return (sample,)
+
+ return UNet2DConditionOutput(sample=sample)
+
+ @classmethod
+ def from_unet(cls, unet: UNet2DConditionModel, high_res_fix: list):
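+        # Copy the donor UNet's config, enable the fix, and transfer the weights unchanged.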
+        config = dict(unet.config)
+ config["high_res_fix"] = high_res_fix
+ unet_high_res = cls(**config)
+ unet_high_res.load_state_dict(unet.state_dict())
+ unet_high_res.to(unet.dtype)
+ return unet_high_res
+
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> import torch
+ >>> from diffusers import DiffusionPipeline
+
+    >>> pipe = DiffusionPipeline.from_pretrained(
+    ...     "CompVis/stable-diffusion-v1-4",
+    ...     custom_pipeline="kohya_hires_fix",
+    ...     torch_dtype=torch.float16,
+    ...     high_res_fix=[{"timestep": 600, "scale_factor": 0.5, "block_num": 1}],
+    ... )
+ >>> pipe = pipe.to("cuda")
+
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
+ >>> image = pipe(prompt, height=1000, width=1600).images[0]
+ ```
+"""
+
+
+class StableDiffusionHighResFixPipeline(StableDiffusionPipeline):
+ r"""
+    Pipeline for text-to-image generation using Stable Diffusion with the Kohya fix for high-resolution generation.
+
+ This model inherits from [`StableDiffusionPipeline`]. Check the superclass documentation for the generic methods.
+
+ The pipeline also inherits the following loading methods:
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+ - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+ - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+ text_encoder ([`~transformers.CLIPTextModel`]):
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+ tokenizer ([`~transformers.CLIPTokenizer`]):
+ A `CLIPTokenizer` to tokenize text.
+ unet ([`UNet2DConditionModel`]):
+ A `UNet2DConditionModel` to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+ about a model's potential harms.
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+ high_res_fix (`List[Dict]`, *optional*, defaults to `[{'timestep': 600, 'scale_factor': 0.5, 'block_num': 1}]`):
+            Enables the Kohya fix for high-resolution generation. The activation maps produced by the down block at
+            `block_num` are scaled by `scale_factor` for all timesteps greater than `timestep`.
+ """
+
+ model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+ _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+ _exclude_from_cpu_offload = ["safety_checker"]
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ image_encoder: CLIPVisionModelWithProjection = None,
+ requires_safety_checker: bool = True,
+ high_res_fix: List[Dict] = [{"timestep": 600, "scale_factor": 0.5, "block_num": 1}],
+ ):
+ super().__init__(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ image_encoder=image_encoder,
+ requires_safety_checker=requires_safety_checker,
+ )
+
+ unet = UNet2DConditionModelHighResFix.from_unet(unet=unet, high_res_fix=high_res_fix)
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ image_encoder=image_encoder,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
diff --git a/examples/community/marigold_depth_estimation.py b/examples/community/marigold_depth_estimation.py
index ef1b45b942cc..0ed7107d6b5c 100644
--- a/examples/community/marigold_depth_estimation.py
+++ b/examples/community/marigold_depth_estimation.py
@@ -43,7 +43,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0")
+check_min_version("0.29.0.dev0")
class MarigoldDepthOutput(BaseOutput):
diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py
index e7d934dd07a8..faf1ee5b5a1f 100644
--- a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py
+++ b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py
@@ -73,7 +73,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
index 56f83f47b84c..7754a8e08b87 100644
--- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
+++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
@@ -66,7 +66,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py b/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py
index ce3e7f624843..e72de4b7877d 100644
--- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py
+++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py
@@ -79,7 +79,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/consistency_distillation/train_lcm_distill_sd_wds.py b/examples/consistency_distillation/train_lcm_distill_sd_wds.py
index 7fbcb5d6fb91..e80bb5571f6a 100644
--- a/examples/consistency_distillation/train_lcm_distill_sd_wds.py
+++ b/examples/consistency_distillation/train_lcm_distill_sd_wds.py
@@ -72,7 +72,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py
index c8b91e7abdd6..96b323ff92f7 100644
--- a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py
+++ b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py
@@ -78,7 +78,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py
index 3daca0e3f56b..61d97a4f1d55 100644
--- a/examples/controlnet/train_controlnet.py
+++ b/examples/controlnet/train_controlnet.py
@@ -60,7 +60,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py
index beb9d23fd750..85f7b57fbe47 100644
--- a/examples/controlnet/train_controlnet_flax.py
+++ b/examples/controlnet/train_controlnet_flax.py
@@ -60,7 +60,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = logging.getLogger(__name__)
diff --git a/examples/controlnet/train_controlnet_sdxl.py b/examples/controlnet/train_controlnet_sdxl.py
index 288a1e3fb612..e09747fe5677 100644
--- a/examples/controlnet/train_controlnet_sdxl.py
+++ b/examples/controlnet/train_controlnet_sdxl.py
@@ -61,7 +61,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
if is_torch_npu_available():
diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index fbd11fc01be2..7b8d9c7dbb10 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -63,7 +63,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 796aba87e846..ef4b7455cc7f 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -63,7 +63,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py
index 23238c84b643..ff09ff20cd95 100644
--- a/examples/dreambooth/train_dreambooth_flax.py
+++ b/examples/dreambooth/train_dreambooth_flax.py
@@ -35,7 +35,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
# Cache compiled models across invocations of this script.
cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index f2cac3b1494d..25b720b654b6 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -70,7 +70,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py
index 9e3d82acdbfe..a6ffb9fa42ae 100644
--- a/examples/dreambooth/train_dreambooth_lora_sdxl.py
+++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py
@@ -78,7 +78,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py
index f1125a2919f0..7517b7c5983d 100644
--- a/examples/instruct_pix2pix/train_instruct_pix2pix.py
+++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py
@@ -57,7 +57,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py b/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py
index 1c0cdf04b2d2..16043ac2701e 100644
--- a/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py
+++ b/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py
@@ -60,7 +60,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py
index 409978cb5373..109e1a1c984a 100644
--- a/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py
+++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py
@@ -52,7 +52,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py
index eb8ae8cca060..af8327ec1b85 100644
--- a/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py
+++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py
@@ -46,7 +46,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
index f6f3896aaa12..f14fa627ab9e 100644
--- a/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
+++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
@@ -46,7 +46,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
index 54a4d0a397b4..377eb81e8212 100644
--- a/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
+++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
@@ -51,7 +51,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/t2i_adapter/train_t2i_adapter_sdxl.py b/examples/t2i_adapter/train_t2i_adapter_sdxl.py
index 50735ef044a6..d99c368910a0 100644
--- a/examples/t2i_adapter/train_t2i_adapter_sdxl.py
+++ b/examples/t2i_adapter/train_t2i_adapter_sdxl.py
@@ -60,7 +60,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 13ee0f2cc4c7..bf2557e35f0c 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -57,7 +57,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py
index c3a08a90b4e5..474cca3595e3 100644
--- a/examples/text_to_image/train_text_to_image_flax.py
+++ b/examples/text_to_image/train_text_to_image_flax.py
@@ -49,7 +49,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = logging.getLogger(__name__)
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index 37b10cfd1bad..e493f89125f4 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -52,8 +52,11 @@
from diffusers.utils.torch_utils import is_compiled_module
+if is_wandb_available():
+ import wandb
+
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
@@ -99,6 +102,48 @@ def save_model_card(
model_card.save(os.path.join(repo_folder, "README.md"))
+def log_validation(
+ pipeline,
+ args,
+ accelerator,
+ epoch,
+ is_final_validation=False,
+):
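+    """Run validation inference and log the generated images to the active trackers.
+
+    Images are logged under the "test" phase when `is_final_validation` is True, otherwise under "validation".
+    """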
+ logger.info(
+ f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+ f" {args.validation_prompt}."
+ )
+ pipeline = pipeline.to(accelerator.device)
+ pipeline.set_progress_bar_config(disable=True)
+ generator = torch.Generator(device=accelerator.device)
+ if args.seed is not None:
+ generator = generator.manual_seed(args.seed)
+ images = []
+ if torch.backends.mps.is_available():
+ autocast_ctx = nullcontext()
+ else:
+ autocast_ctx = torch.autocast(accelerator.device.type)
+
+ with autocast_ctx:
+ for _ in range(args.num_validation_images):
+ images.append(pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0])
+
+ for tracker in accelerator.trackers:
+ phase_name = "test" if is_final_validation else "validation"
+ if tracker.name == "tensorboard":
+ np_images = np.stack([np.asarray(img) for img in images])
+ tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+ if tracker.name == "wandb":
+ tracker.log(
+ {
+ phase_name: [
+ wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+ ]
+ }
+ )
+ return images
+
+
def parse_args():
parser = argparse.ArgumentParser(description="Simple example of a training script.")
parser.add_argument(
@@ -414,11 +459,6 @@ def main():
if torch.backends.mps.is_available():
accelerator.native_amp = False
- if args.report_to == "wandb":
- if not is_wandb_available():
- raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
- import wandb
-
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -864,10 +904,6 @@ def collate_fn(examples):
if accelerator.is_main_process:
if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
- logger.info(
- f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
- f" {args.validation_prompt}."
- )
# create pipeline
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
@@ -876,38 +912,7 @@ def collate_fn(examples):
variant=args.variant,
torch_dtype=weight_dtype,
)
- pipeline = pipeline.to(accelerator.device)
- pipeline.set_progress_bar_config(disable=True)
-
- # run inference
- generator = torch.Generator(device=accelerator.device)
- if args.seed is not None:
- generator = generator.manual_seed(args.seed)
- images = []
- if torch.backends.mps.is_available():
- autocast_ctx = nullcontext()
- else:
- autocast_ctx = torch.autocast(accelerator.device.type)
-
- with autocast_ctx:
- for _ in range(args.num_validation_images):
- images.append(
- pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]
- )
-
- for tracker in accelerator.trackers:
- if tracker.name == "tensorboard":
- np_images = np.stack([np.asarray(img) for img in images])
- tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
- if tracker.name == "wandb":
- tracker.log(
- {
- "validation": [
- wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
- for i, image in enumerate(images)
- ]
- }
- )
+ images = log_validation(pipeline, args, accelerator, epoch)
del pipeline
torch.cuda.empty_cache()
@@ -925,21 +930,6 @@ def collate_fn(examples):
safe_serialization=True,
)
- if args.push_to_hub:
- save_model_card(
- repo_id,
- images=images,
- base_model=args.pretrained_model_name_or_path,
- dataset_name=args.dataset_name,
- repo_folder=args.output_dir,
- )
- upload_folder(
- repo_id=repo_id,
- folder_path=args.output_dir,
- commit_message="End of training",
- ignore_patterns=["step_*", "epoch_*"],
- )
-
# Final inference
# Load previous pipeline
if args.validation_prompt is not None:
@@ -949,41 +939,27 @@ def collate_fn(examples):
variant=args.variant,
torch_dtype=weight_dtype,
)
- pipeline = pipeline.to(accelerator.device)
# load attention processors
pipeline.load_lora_weights(args.output_dir)
# run inference
- generator = torch.Generator(device=accelerator.device)
- if args.seed is not None:
- generator = generator.manual_seed(args.seed)
- images = []
- if torch.backends.mps.is_available():
- autocast_ctx = nullcontext()
- else:
- autocast_ctx = torch.autocast(accelerator.device.type)
-
- with autocast_ctx:
- for _ in range(args.num_validation_images):
- images.append(
- pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]
- )
+ images = log_validation(pipeline, args, accelerator, epoch, is_final_validation=True)
- for tracker in accelerator.trackers:
- if len(images) != 0:
- if tracker.name == "tensorboard":
- np_images = np.stack([np.asarray(img) for img in images])
- tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
- if tracker.name == "wandb":
- tracker.log(
- {
- "test": [
- wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
- for i, image in enumerate(images)
- ]
- }
- )
+ if args.push_to_hub:
+ save_model_card(
+ repo_id,
+ images=images,
+ base_model=args.pretrained_model_name_or_path,
+ dataset_name=args.dataset_name,
+ repo_folder=args.output_dir,
+ )
+ upload_folder(
+ repo_id=repo_id,
+ folder_path=args.output_dir,
+ commit_message="End of training",
+ ignore_patterns=["step_*", "epoch_*"],
+ )
accelerator.end_training()
diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py
index c9883252d14b..077bee2e2f69 100644
--- a/examples/text_to_image/train_text_to_image_lora_sdxl.py
+++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py
@@ -65,7 +65,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
if is_torch_npu_available():
diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py
index 74864da20d82..19abbc9ec682 100644
--- a/examples/text_to_image/train_text_to_image_sdxl.py
+++ b/examples/text_to_image/train_text_to_image_sdxl.py
@@ -55,7 +55,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
if is_torch_npu_available():
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index 4922789862b5..cd19cae776ed 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -81,7 +81,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py
index 8534336400ff..a62b36c4de3e 100644
--- a/examples/textual_inversion/textual_inversion_flax.py
+++ b/examples/textual_inversion/textual_inversion_flax.py
@@ -56,7 +56,7 @@
# ------------------------------------------------------------------------------
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = logging.getLogger(__name__)
diff --git a/examples/textual_inversion/textual_inversion_sdxl.py b/examples/textual_inversion/textual_inversion_sdxl.py
index c24a4c4f4855..f13835d928a8 100644
--- a/examples/textual_inversion/textual_inversion_sdxl.py
+++ b/examples/textual_inversion/textual_inversion_sdxl.py
@@ -76,7 +76,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py
index 9996a30713e0..c1f37eae21d6 100644
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -29,7 +29,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/vqgan/train_vqgan.py b/examples/vqgan/train_vqgan.py
index b7beee1f3b26..e33eef777264 100644
--- a/examples/vqgan/train_vqgan.py
+++ b/examples/vqgan/train_vqgan.py
@@ -50,7 +50,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py b/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
index 79f7d8576ff4..44fbee463b9f 100644
--- a/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
+++ b/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
@@ -50,7 +50,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/wuerstchen/text_to_image/train_text_to_image_prior.py b/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
index 3e0acfdaf519..1f015b0b2005 100644
--- a/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
+++ b/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
@@ -51,7 +51,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.29.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/setup.py b/setup.py
index 943238df765d..598291fa4546 100644
--- a/setup.py
+++ b/setup.py
@@ -254,14 +254,14 @@ def run(self):
setup(
name="diffusers",
- version="0.28.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+ version="0.29.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
description="State-of-the-art diffusion in PyTorch and JAX.",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
keywords="deep learning diffusion jax pytorch stable diffusion audioldm",
license="Apache 2.0 License",
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/diffusers/graphs/contributors)",
- author_email="patrick@huggingface.co",
+ author_email="diffusers@huggingface.co",
url="https://github.com/huggingface/diffusers",
package_dir={"": "src"},
packages=find_packages("src"),
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 66c98804eadc..9262fdb1d871 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.28.0.dev0"
+__version__ = "0.29.0.dev0"
from typing import TYPE_CHECKING
@@ -259,6 +259,8 @@
"LDMTextToImagePipeline",
"LEditsPPPipelineStableDiffusion",
"LEditsPPPipelineStableDiffusionXL",
+ "MarigoldDepthPipeline",
+ "MarigoldNormalsPipeline",
"MusicLDMPipeline",
"PaintByExamplePipeline",
"PIAPipeline",
@@ -637,6 +639,8 @@
LDMTextToImagePipeline,
LEditsPPPipelineStableDiffusion,
LEditsPPPipelineStableDiffusionXL,
+ MarigoldDepthPipeline,
+ MarigoldNormalsPipeline,
MusicLDMPipeline,
PaintByExamplePipeline,
PIAPipeline,
diff --git a/src/diffusers/loaders/lora_conversion_utils.py b/src/diffusers/loaders/lora_conversion_utils.py
index 11e3311a6402..e233c916f919 100644
--- a/src/diffusers/loaders/lora_conversion_utils.py
+++ b/src/diffusers/loaders/lora_conversion_utils.py
@@ -226,6 +226,8 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
+ diffusers_name = diffusers_name.replace("text.projection", "text_projection")
+
if "self_attn" in diffusers_name:
if lora_name.startswith(("lora_te_", "lora_te1_")):
te_state_dict[diffusers_name] = state_dict.pop(key)
@@ -243,6 +245,10 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
else:
te2_state_dict[diffusers_name] = state_dict.pop(key)
te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+        # OneTrainer-specific: the second text encoder's text projection can also carry LoRA weights
+ elif "text_projection" in diffusers_name and lora_name.startswith("lora_te2_"):
+ te2_state_dict[diffusers_name] = state_dict.pop(key)
+ te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
if (is_te_dora_lora or is_te2_dora_lora) and lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")):
dora_scale_key_to_replace_te = (
@@ -270,7 +276,7 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
network_alphas.update({new_name: alpha})
if len(state_dict) > 0:
- raise ValueError(f"The following keys have not been correctly be renamed: \n\n {', '.join(state_dict.keys())}")
+ raise ValueError(f"The following keys have not been correctly renamed: \n\n {', '.join(state_dict.keys())}")
logger.info("Kohya-style checkpoint detected.")
unet_state_dict = {f"{unet_name}.{module_name}": params for module_name, params in unet_state_dict.items()}
diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py
index 7db7bfeda600..cf67da1caed1 100644
--- a/src/diffusers/loaders/unet.py
+++ b/src/diffusers/loaders/unet.py
@@ -847,7 +847,12 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_us
embed_dims = state_dict["proj_in.weight"].shape[1]
output_dims = state_dict["proj_out.weight"].shape[0]
hidden_dims = state_dict["latents"].shape[2]
- heads = state_dict["layers.0.0.to_q.weight"].shape[0] // 64
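+        # Two resampler checkpoint layouts exist: one names the attention module explicitly
+        # ("layers.0.attn.to_q.weight"), the other uses a bare index ("layers.0.0.to_q.weight").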
+ attn_key_present = any("attn" in k for k in state_dict)
+ heads = (
+ state_dict["layers.0.attn.to_q.weight"].shape[0] // 64
+ if attn_key_present
+ else state_dict["layers.0.0.to_q.weight"].shape[0] // 64
+ )
with init_context():
image_projection = IPAdapterPlusImageProjection(
@@ -860,26 +865,53 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_us
for key, value in state_dict.items():
diffusers_name = key.replace("0.to", "2.to")
- diffusers_name = diffusers_name.replace("1.0.weight", "3.0.weight")
- diffusers_name = diffusers_name.replace("1.0.bias", "3.0.bias")
- diffusers_name = diffusers_name.replace("1.1.weight", "3.1.net.0.proj.weight")
- diffusers_name = diffusers_name.replace("1.3.weight", "3.1.net.2.weight")
- if "norm1" in diffusers_name:
- updated_state_dict[diffusers_name.replace("0.norm1", "0")] = value
- elif "norm2" in diffusers_name:
- updated_state_dict[diffusers_name.replace("0.norm2", "1")] = value
- elif "to_kv" in diffusers_name:
+ diffusers_name = diffusers_name.replace("0.0.norm1", "0.ln0")
+ diffusers_name = diffusers_name.replace("0.0.norm2", "0.ln1")
+ diffusers_name = diffusers_name.replace("1.0.norm1", "1.ln0")
+ diffusers_name = diffusers_name.replace("1.0.norm2", "1.ln1")
+ diffusers_name = diffusers_name.replace("2.0.norm1", "2.ln0")
+ diffusers_name = diffusers_name.replace("2.0.norm2", "2.ln1")
+ diffusers_name = diffusers_name.replace("3.0.norm1", "3.ln0")
+ diffusers_name = diffusers_name.replace("3.0.norm2", "3.ln1")
+
+ if "to_kv" in diffusers_name:
+ parts = diffusers_name.split(".")
+ parts[2] = "attn"
+ diffusers_name = ".".join(parts)
v_chunk = value.chunk(2, dim=0)
updated_state_dict[diffusers_name.replace("to_kv", "to_k")] = v_chunk[0]
updated_state_dict[diffusers_name.replace("to_kv", "to_v")] = v_chunk[1]
+ elif "to_q" in diffusers_name:
+ parts = diffusers_name.split(".")
+ parts[2] = "attn"
+ diffusers_name = ".".join(parts)
+ updated_state_dict[diffusers_name] = value
elif "to_out" in diffusers_name:
+ parts = diffusers_name.split(".")
+ parts[2] = "attn"
+ diffusers_name = ".".join(parts)
updated_state_dict[diffusers_name.replace("to_out", "to_out.0")] = value
else:
+ diffusers_name = diffusers_name.replace("0.1.0", "0.ff.0")
+ diffusers_name = diffusers_name.replace("0.1.1", "0.ff.1.net.0.proj")
+ diffusers_name = diffusers_name.replace("0.1.3", "0.ff.1.net.2")
+
+ diffusers_name = diffusers_name.replace("1.1.0", "1.ff.0")
+ diffusers_name = diffusers_name.replace("1.1.1", "1.ff.1.net.0.proj")
+ diffusers_name = diffusers_name.replace("1.1.3", "1.ff.1.net.2")
+
+ diffusers_name = diffusers_name.replace("2.1.0", "2.ff.0")
+ diffusers_name = diffusers_name.replace("2.1.1", "2.ff.1.net.0.proj")
+ diffusers_name = diffusers_name.replace("2.1.3", "2.ff.1.net.2")
+
+ diffusers_name = diffusers_name.replace("3.1.0", "3.ff.0")
+ diffusers_name = diffusers_name.replace("3.1.1", "3.ff.1.net.0.proj")
+ diffusers_name = diffusers_name.replace("3.1.3", "3.ff.1.net.2")
updated_state_dict[diffusers_name] = value
if not low_cpu_mem_usage:
- image_projection.load_state_dict(updated_state_dict)
+ image_projection.load_state_dict(updated_state_dict, strict=True)
else:
load_model_dict_into_meta(image_projection, updated_state_dict, device=self.device, dtype=self.dtype)
diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
index 78b0efff921d..6b29dd5f5460 100644
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -31,6 +31,7 @@
_import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
_import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"]
_import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
+ _import_structure["autoencoders.vq_model"] = ["VQModel"]
_import_structure["controlnet"] = ["ControlNetModel"]
_import_structure["controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
_import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"]
@@ -50,7 +51,6 @@
_import_structure["unets.unet_spatio_temporal_condition"] = ["UNetSpatioTemporalConditionModel"]
_import_structure["unets.unet_stable_cascade"] = ["StableCascadeUNet"]
_import_structure["unets.uvit_2d"] = ["UVit2DModel"]
- _import_structure["vq_model"] = ["VQModel"]
if is_flax_available():
_import_structure["controlnet_flax"] = ["FlaxControlNetModel"]
@@ -67,6 +67,7 @@
AutoencoderKLTemporalDecoder,
AutoencoderTiny,
ConsistencyDecoderVAE,
+ VQModel,
)
from .controlnet import ControlNetModel
from .controlnet_xs import ControlNetXSAdapter, UNetControlNetXSModel
@@ -92,7 +93,6 @@
UNetSpatioTemporalConditionModel,
UVit2DModel,
)
- from .vq_model import VQModel
if is_flax_available():
from .controlnet_flax import FlaxControlNetModel
diff --git a/src/diffusers/models/autoencoders/__init__.py b/src/diffusers/models/autoencoders/__init__.py
index 201a40ff17b2..5c47748d62e0 100644
--- a/src/diffusers/models/autoencoders/__init__.py
+++ b/src/diffusers/models/autoencoders/__init__.py
@@ -3,3 +3,4 @@
from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
from .autoencoder_tiny import AutoencoderTiny
from .consistency_decoder_vae import ConsistencyDecoderVAE
+from .vq_model import VQModel
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index 9d919d374ae6..e8fec3564679 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -245,11 +245,13 @@ def encode(
Args:
x (`torch.Tensor`): Input batch of images.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+ Whether to return a [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] instead of a plain
+ tuple.
Returns:
The latent representations of the encoded images. If `return_dict` is True, a
- [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+ [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is
+ returned.
"""
if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
return self.tiled_encode(x, return_dict=return_dict)
@@ -331,12 +333,13 @@ def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> Autoencoder
Args:
x (`torch.Tensor`): Input batch of images.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+ Whether or not to return a [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] instead of a
+ plain tuple.
Returns:
- [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
- If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
- `tuple` is returned.
+ [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
+ If return_dict is True, a [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] is returned,
+ otherwise a plain `tuple` is returned.
"""
overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
index 67540cb7dc7f..b73202aedb16 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -323,11 +323,13 @@ def encode(
Args:
x (`torch.Tensor`): Input batch of images.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+ Whether to return a [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] instead of a plain
+ tuple.
Returns:
The latent representations of the encoded images. If `return_dict` is True, a
- [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+ [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is
+ returned.
"""
h = self.encoder(x)
moments = self.quant_conv(h)
diff --git a/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/src/diffusers/models/autoencoders/consistency_decoder_vae.py
index 212c46537706..3409549c65dc 100644
--- a/src/diffusers/models/autoencoders/consistency_decoder_vae.py
+++ b/src/diffusers/models/autoencoders/consistency_decoder_vae.py
@@ -284,13 +284,13 @@ def encode(
Args:
x (`torch.Tensor`): Input batch of images.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain
- tuple.
+ Whether to return a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
+ instead of a plain tuple.
Returns:
The latent representations of the encoded images. If `return_dict` is True, a
- [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a plain `tuple`
- is returned.
+ [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a
+ plain `tuple` is returned.
"""
if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
return self.tiled_encode(x, return_dict=return_dict)
@@ -382,13 +382,13 @@ def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[Consi
Args:
x (`torch.Tensor`): Input batch of images.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a
- plain tuple.
+ Whether or not to return a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
+ instead of a plain tuple.
Returns:
- [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`:
- If return_dict is True, a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned,
- otherwise a plain `tuple` is returned.
+ [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`:
+ If return_dict is True, a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
+ is returned, otherwise a plain `tuple` is returned.
"""
overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
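The `overlap_size`/`blend_extent` arithmetic in the context lines above is shared by both tiled encoders. A small worked example, assuming the usual defaults (512-pixel tiles, 64-latent tiles, 0.25 overlap); the numbers are assumptions for illustration, not values taken from this diff:

# Worked example of the tiling formulas (assumed default values).
tile_sample_min_size = 512   # tile size in pixel space
tile_latent_min_size = 64    # tile size in latent space
tile_overlap_factor = 0.25

overlap_size = int(tile_sample_min_size * (1 - tile_overlap_factor))  # 384: stride between neighbouring tiles
blend_extent = int(tile_latent_min_size * tile_overlap_factor)        # 16: latent rows/columns blended at the seams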
diff --git a/src/diffusers/models/autoencoders/vq_model.py b/src/diffusers/models/autoencoders/vq_model.py
new file mode 100644
index 000000000000..2f9e75623eee
--- /dev/null
+++ b/src/diffusers/models/autoencoders/vq_model.py
@@ -0,0 +1,182 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ...utils.accelerate_utils import apply_forward_hook
+from ..autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer
+from ..modeling_utils import ModelMixin
+
+
+@dataclass
+class VQEncoderOutput(BaseOutput):
+ """
+ Output of VQModel encoding method.
+
+ Args:
+ latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
+ The encoded output sample from the last layer of the model.
+ """
+
+ latents: torch.Tensor
+
+
+class VQModel(ModelMixin, ConfigMixin):
+ r"""
+ A VQ-VAE model for decoding latent representations.
+
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+ for all models (such as downloading or saving).
+
+ Parameters:
+ in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+ out_channels (int, *optional*, defaults to 3): Number of channels in the output.
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+ Tuple of downsample block types.
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+ Tuple of upsample block types.
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+ Tuple of block output channels.
+ layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block.
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+ latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
+ sample_size (`int`, *optional*, defaults to `32`): Sample input size.
+ num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
+ norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers.
+ vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
+ scaling_factor (`float`, *optional*, defaults to `0.18215`):
+ The component-wise standard deviation of the trained latent space computed using the first batch of the
+ training set. This is used to scale the latent space to have unit variance when training the diffusion
+ model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+ diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+ / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+ Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+ norm_type (`str`, *optional*, defaults to `"group"`):
+ Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
+ """
+
+ @register_to_config
+ def __init__(
+ self,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+ up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+ block_out_channels: Tuple[int, ...] = (64,),
+ layers_per_block: int = 1,
+ act_fn: str = "silu",
+ latent_channels: int = 3,
+ sample_size: int = 32,
+ num_vq_embeddings: int = 256,
+ norm_num_groups: int = 32,
+ vq_embed_dim: Optional[int] = None,
+ scaling_factor: float = 0.18215,
+ norm_type: str = "group", # group, spatial
+ mid_block_add_attention: bool = True,
+ lookup_from_codebook: bool = False,
+ force_upcast: bool = False,
+ ):
+ super().__init__()
+
+ # pass init params to Encoder
+ self.encoder = Encoder(
+ in_channels=in_channels,
+ out_channels=latent_channels,
+ down_block_types=down_block_types,
+ block_out_channels=block_out_channels,
+ layers_per_block=layers_per_block,
+ act_fn=act_fn,
+ norm_num_groups=norm_num_groups,
+ double_z=False,
+ mid_block_add_attention=mid_block_add_attention,
+ )
+
+ vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
+
+ self.quant_conv = nn.Conv2d(latent_channels, vq_embed_dim, 1)
+ self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
+ self.post_quant_conv = nn.Conv2d(vq_embed_dim, latent_channels, 1)
+
+ # pass init params to Decoder
+ self.decoder = Decoder(
+ in_channels=latent_channels,
+ out_channels=out_channels,
+ up_block_types=up_block_types,
+ block_out_channels=block_out_channels,
+ layers_per_block=layers_per_block,
+ act_fn=act_fn,
+ norm_num_groups=norm_num_groups,
+ norm_type=norm_type,
+ mid_block_add_attention=mid_block_add_attention,
+ )
+
+ @apply_forward_hook
+ def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput:
+ h = self.encoder(x)
+ h = self.quant_conv(h)
+
+ if not return_dict:
+ return (h,)
+
+ return VQEncoderOutput(latents=h)
+
+ @apply_forward_hook
+ def decode(
+ self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None
+ ) -> Union[DecoderOutput, torch.Tensor]:
+ # also go through quantization layer
+ if not force_not_quantize:
+ quant, commit_loss, _ = self.quantize(h)
+ elif self.config.lookup_from_codebook:
+ quant = self.quantize.get_codebook_entry(h, shape)
+ commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
+ else:
+ quant = h
+ commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
+ quant2 = self.post_quant_conv(quant)
+ dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None)
+
+ if not return_dict:
+ return dec, commit_loss
+
+ return DecoderOutput(sample=dec, commit_loss=commit_loss)
+
+ def forward(
+ self, sample: torch.Tensor, return_dict: bool = True
+ ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]:
+ r"""
+ The [`VQModel`] forward method.
+
+ Args:
+ sample (`torch.Tensor`): Input sample.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~models.autoencoders.vae.DecoderOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~models.autoencoders.vae.DecoderOutput`] or `tuple`:
+ If return_dict is True, a [`~models.autoencoders.vae.DecoderOutput`] is returned, otherwise a plain
+ `tuple` is returned.
+ """
+
+ h = self.encode(sample).latents
+ dec = self.decode(h)
+
+ if not return_dict:
+ return dec.sample, dec.commit_loss
+ return dec
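With `vq_model.py` now living under `models/autoencoders` (and exported from the package `__init__` above), a minimal round trip looks as follows. The tiny configuration is arbitrary and exists only so the sketch runs quickly; it is not a recommended setup:

import torch
from diffusers.models.autoencoders.vq_model import VQModel

# Tiny, randomly initialized model (illustrative configuration).
model = VQModel(
    in_channels=3,
    out_channels=3,
    block_out_channels=(32,),
    layers_per_block=1,
    latent_channels=3,
    num_vq_embeddings=64,
    norm_num_groups=8,
    sample_size=32,
)

x = torch.randn(1, 3, 32, 32)
with torch.no_grad():
    latents = model.encode(x).latents   # VQEncoderOutput.latents
    decoded = model.decode(latents)     # DecoderOutput(sample=..., commit_loss=...)
print(decoded.sample.shape, decoded.commit_loss)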
diff --git a/src/diffusers/models/dual_transformer_2d.py b/src/diffusers/models/dual_transformer_2d.py
deleted file mode 100644
index b8e40f14d5a8..000000000000
--- a/src/diffusers/models/dual_transformer_2d.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..utils import deprecate
-from .transformers.dual_transformer_2d import DualTransformer2DModel
-
-
-class DualTransformer2DModel(DualTransformer2DModel):
- deprecation_message = "Importing `DualTransformer2DModel` from `diffusers.models.dual_transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.dual_transformer_2d import DualTransformer2DModel`, instead."
- deprecate("DualTransformer2DModel", "0.29", deprecation_message)
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
index d13f8a06cf63..d2940e861c83 100644
--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -806,6 +806,39 @@ def forward(self, caption):
return hidden_states
+class IPAdapterPlusImageProjectionBlock(nn.Module):
+ def __init__(
+ self,
+ embed_dims: int = 768,
+ dim_head: int = 64,
+ heads: int = 16,
+ ffn_ratio: float = 4,
+ ) -> None:
+ super().__init__()
+ from .attention import FeedForward  # Lazy import to avoid circular import
+
+ self.ln0 = nn.LayerNorm(embed_dims)
+ self.ln1 = nn.LayerNorm(embed_dims)
+ self.attn = Attention(
+ query_dim=embed_dims,
+ dim_head=dim_head,
+ heads=heads,
+ out_bias=False,
+ )
+ self.ff = nn.Sequential(
+ nn.LayerNorm(embed_dims),
+ FeedForward(embed_dims, embed_dims, activation_fn="gelu", mult=ffn_ratio, bias=False),
+ )
+
+ def forward(self, x, latents, residual):
+ encoder_hidden_states = self.ln0(x)
+ latents = self.ln1(latents)
+ encoder_hidden_states = torch.cat([encoder_hidden_states, latents], dim=-2)
+ latents = self.attn(latents, encoder_hidden_states) + residual
+ latents = self.ff(latents) + latents
+ return latents
+
+
class IPAdapterPlusImageProjection(nn.Module):
"""Resampler of IP-Adapter Plus.
@@ -834,8 +867,6 @@ def __init__(
ffn_ratio: float = 4,
) -> None:
super().__init__()
- from .attention import FeedForward # Lazy import to avoid circular import
-
self.latents = nn.Parameter(torch.randn(1, num_queries, hidden_dims) / hidden_dims**0.5)
self.proj_in = nn.Linear(embed_dims, hidden_dims)
@@ -843,26 +874,9 @@ def __init__(
self.proj_out = nn.Linear(hidden_dims, output_dims)
self.norm_out = nn.LayerNorm(output_dims)
- self.layers = nn.ModuleList([])
- for _ in range(depth):
- self.layers.append(
- nn.ModuleList(
- [
- nn.LayerNorm(hidden_dims),
- nn.LayerNorm(hidden_dims),
- Attention(
- query_dim=hidden_dims,
- dim_head=dim_head,
- heads=heads,
- out_bias=False,
- ),
- nn.Sequential(
- nn.LayerNorm(hidden_dims),
- FeedForward(hidden_dims, hidden_dims, activation_fn="gelu", mult=ffn_ratio, bias=False),
- ),
- ]
- )
- )
+ self.layers = nn.ModuleList(
+ [IPAdapterPlusImageProjectionBlock(hidden_dims, dim_head, heads, ffn_ratio) for _ in range(depth)]
+ )
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Forward pass.
@@ -876,52 +890,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.proj_in(x)
- for ln0, ln1, attn, ff in self.layers:
+ for block in self.layers:
residual = latents
-
- encoder_hidden_states = ln0(x)
- latents = ln1(latents)
- encoder_hidden_states = torch.cat([encoder_hidden_states, latents], dim=-2)
- latents = attn(latents, encoder_hidden_states) + residual
- latents = ff(latents) + latents
+ latents = block(x, latents, residual)
latents = self.proj_out(latents)
return self.norm_out(latents)
-class IPAdapterPlusImageProjectionBlock(nn.Module):
- def __init__(
- self,
- embed_dims: int = 768,
- dim_head: int = 64,
- heads: int = 16,
- ffn_ratio: float = 4,
- ) -> None:
- super().__init__()
- from .attention import FeedForward
-
- self.ln0 = nn.LayerNorm(embed_dims)
- self.ln1 = nn.LayerNorm(embed_dims)
- self.attn = Attention(
- query_dim=embed_dims,
- dim_head=dim_head,
- heads=heads,
- out_bias=False,
- )
- self.ff = nn.Sequential(
- nn.LayerNorm(embed_dims),
- FeedForward(embed_dims, embed_dims, activation_fn="gelu", mult=ffn_ratio, bias=False),
- )
-
- def forward(self, x, latents, residual):
- encoder_hidden_states = self.ln0(x)
- latents = self.ln1(latents)
- encoder_hidden_states = torch.cat([encoder_hidden_states, latents], dim=-2)
- latents = self.attn(latents, encoder_hidden_states) + residual
- latents = self.ff(latents) + latents
- return latents
-
-
class IPAdapterFaceIDPlusImageProjection(nn.Module):
"""FacePerceiverResampler of IP-Adapter Plus.
diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py
deleted file mode 100644
index 328835a95381..000000000000
--- a/src/diffusers/models/prior_transformer.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from ..utils import deprecate
-from .transformers.prior_transformer import PriorTransformer, PriorTransformerOutput
-
-
-class PriorTransformerOutput(PriorTransformerOutput):
- deprecation_message = "Importing `PriorTransformerOutput` from `diffusers.models.prior_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.prior_transformer import PriorTransformerOutput`, instead."
- deprecate("PriorTransformerOutput", "0.29", deprecation_message)
-
-
-class PriorTransformer(PriorTransformer):
- deprecation_message = "Importing `PriorTransformer` from `diffusers.models.prior_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.prior_transformer import PriorTransformer`, instead."
- deprecate("PriorTransformer", "0.29", deprecation_message)
diff --git a/src/diffusers/models/t5_film_transformer.py b/src/diffusers/models/t5_film_transformer.py
deleted file mode 100644
index 6aa5ff7449de..000000000000
--- a/src/diffusers/models/t5_film_transformer.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..utils import deprecate
-from .transformers.t5_film_transformer import (
- DecoderLayer,
- NewGELUActivation,
- T5DenseGatedActDense,
- T5FilmDecoder,
- T5FiLMLayer,
- T5LayerCrossAttention,
- T5LayerFFCond,
- T5LayerNorm,
- T5LayerSelfAttentionCond,
-)
-
-
-class T5FilmDecoder(T5FilmDecoder):
- deprecation_message = "Importing `T5FilmDecoder` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5FilmDecoder`, instead."
- deprecate("T5FilmDecoder", "0.29", deprecation_message)
-
-
-class DecoderLayer(DecoderLayer):
- deprecation_message = "Importing `DecoderLayer` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import DecoderLayer`, instead."
- deprecate("DecoderLayer", "0.29", deprecation_message)
-
-
-class T5LayerSelfAttentionCond(T5LayerSelfAttentionCond):
- deprecation_message = "Importing `T5LayerSelfAttentionCond` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerSelfAttentionCond`, instead."
- deprecate("T5LayerSelfAttentionCond", "0.29", deprecation_message)
-
-
-class T5LayerCrossAttention(T5LayerCrossAttention):
- deprecation_message = "Importing `T5LayerCrossAttention` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerCrossAttention`, instead."
- deprecate("T5LayerCrossAttention", "0.29", deprecation_message)
-
-
-class T5LayerFFCond(T5LayerFFCond):
- deprecation_message = "Importing `T5LayerFFCond` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerFFCond`, instead."
- deprecate("T5LayerFFCond", "0.29", deprecation_message)
-
-
-class T5DenseGatedActDense(T5DenseGatedActDense):
- deprecation_message = "Importing `T5DenseGatedActDense` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5DenseGatedActDense`, instead."
- deprecate("T5DenseGatedActDense", "0.29", deprecation_message)
-
-
-class T5LayerNorm(T5LayerNorm):
- deprecation_message = "Importing `T5LayerNorm` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerNorm`, instead."
- deprecate("T5LayerNorm", "0.29", deprecation_message)
-
-
-class NewGELUActivation(NewGELUActivation):
- deprecation_message = "Importing `T5LayerNorm` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import NewGELUActivation`, instead."
- deprecate("NewGELUActivation", "0.29", deprecation_message)
-
-
-class T5FiLMLayer(T5FiLMLayer):
- deprecation_message = "Importing `T5FiLMLayer` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5FiLMLayer`, instead."
- deprecate("T5FiLMLayer", "0.29", deprecation_message)
diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py
deleted file mode 100644
index 5d8ef1347af1..000000000000
--- a/src/diffusers/models/transformer_2d.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..utils import deprecate
-from .transformers.transformer_2d import Transformer2DModel, Transformer2DModelOutput
-
-
-class Transformer2DModelOutput(Transformer2DModelOutput):
- deprecation_message = "Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead."
- deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
-
-
-class Transformer2DModel(Transformer2DModel):
- deprecation_message = "Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead."
- deprecate("Transformer2DModel", "0.29", deprecation_message)
diff --git a/src/diffusers/models/transformer_temporal.py b/src/diffusers/models/transformer_temporal.py
deleted file mode 100644
index 02e504580238..000000000000
--- a/src/diffusers/models/transformer_temporal.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..utils import deprecate
-from .transformers.transformer_temporal import (
- TransformerSpatioTemporalModel,
- TransformerTemporalModel,
- TransformerTemporalModelOutput,
-)
-
-
-class TransformerTemporalModelOutput(TransformerTemporalModelOutput):
- deprecation_message = "Importing `TransformerTemporalModelOutput` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_temporal import TransformerTemporalModelOutput`, instead."
- deprecate("TransformerTemporalModelOutput", "0.29", deprecation_message)
-
-
-class TransformerTemporalModel(TransformerTemporalModel):
- deprecation_message = "Importing `TransformerTemporalModel` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_temporal import TransformerTemporalModel`, instead."
- deprecate("TransformerTemporalModel", "0.29", deprecation_message)
-
-
-class TransformerSpatioTemporalModel(TransformerSpatioTemporalModel):
- deprecation_message = "Importing `TransformerSpatioTemporalModel` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_temporal import TransformerSpatioTemporalModel`, instead."
- deprecate("TransformerTemporalModelOutput", "0.29", deprecation_message)
diff --git a/src/diffusers/models/transformers/dual_transformer_2d.py b/src/diffusers/models/transformers/dual_transformer_2d.py
index edc8cbf78382..bbf2d387e43f 100644
--- a/src/diffusers/models/transformers/dual_transformer_2d.py
+++ b/src/diffusers/models/transformers/dual_transformer_2d.py
@@ -123,9 +123,9 @@ def forward(
tuple.
Returns:
- [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
- [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
- returning a tuple, the first element is the sample tensor.
+ [`~models.transformers.transformer_2d.Transformer2DModelOutput`] or `tuple`:
+ [`~models.transformers.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a
+ `tuple`. When returning a tuple, the first element is the sample tensor.
"""
input_states = hidden_states
diff --git a/src/diffusers/models/transformers/prior_transformer.py b/src/diffusers/models/transformers/prior_transformer.py
index 8dbcfc64e09c..edac0d9e9388 100644
--- a/src/diffusers/models/transformers/prior_transformer.py
+++ b/src/diffusers/models/transformers/prior_transformer.py
@@ -266,13 +266,13 @@ def forward(
attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`):
Text mask for the text embeddings.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.prior_transformer.PriorTransformerOutput`] instead of a plain
- tuple.
+ Whether or not to return a [`~models.transformers.prior_transformer.PriorTransformerOutput`] instead of
+ a plain tuple.
Returns:
- [`~models.prior_transformer.PriorTransformerOutput`] or `tuple`:
- If return_dict is True, a [`~models.prior_transformer.PriorTransformerOutput`] is returned, otherwise a
- tuple is returned where the first element is the sample tensor.
+ [`~models.transformers.prior_transformer.PriorTransformerOutput`] or `tuple`:
+ If return_dict is True, a [`~models.transformers.prior_transformer.PriorTransformerOutput`] is
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
"""
batch_size = hidden_states.shape[0]
diff --git a/src/diffusers/models/transformers/transformer_2d.py b/src/diffusers/models/transformers/transformer_2d.py
index ef9e0de0b662..5f21b2f0e785 100644
--- a/src/diffusers/models/transformers/transformer_2d.py
+++ b/src/diffusers/models/transformers/transformer_2d.py
@@ -377,8 +377,8 @@ def forward(
tuple.
Returns:
- If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
- `tuple` where the first element is the sample tensor.
+ If `return_dict` is True, an [`~models.transformers.transformer_2d.Transformer2DModelOutput`] is returned,
+ otherwise a `tuple` where the first element is the sample tensor.
"""
if cross_attention_kwargs is not None:
if cross_attention_kwargs.get("scale", None) is not None:
diff --git a/src/diffusers/models/transformers/transformer_temporal.py b/src/diffusers/models/transformers/transformer_temporal.py
index 2e1bb041a207..c0c5467050dd 100644
--- a/src/diffusers/models/transformers/transformer_temporal.py
+++ b/src/diffusers/models/transformers/transformer_temporal.py
@@ -149,13 +149,14 @@ def forward(
`self.processor` in
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
- tuple.
+ Whether or not to return a [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`]
+ instead of a plain tuple.
Returns:
- [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is
- returned, otherwise a `tuple` where the first element is the sample tensor.
+ [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
+ If `return_dict` is True, an
+ [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] is returned, otherwise a
+ `tuple` where the first element is the sample tensor.
"""
# 1. Input
batch_frames, channel, height, width = hidden_states.shape
@@ -294,13 +295,14 @@ def forward(
A tensor indicating whether the input contains only images. 1 indicates that the input contains only
images, 0 indicates that the input contains video frames.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a
- plain tuple.
+ Whether or not to return a [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`]
+ instead of a plain tuple.
Returns:
- [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is
- returned, otherwise a `tuple` where the first element is the sample tensor.
+ [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
+ If `return_dict` is True, an
+ [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] is returned, otherwise a
+ `tuple` where the first element is the sample tensor.
"""
# 1. Input
batch_frames, _, height, width = hidden_states.shape
diff --git a/src/diffusers/models/unet_1d.py b/src/diffusers/models/unet_1d.py
deleted file mode 100644
index e857c90cae40..000000000000
--- a/src/diffusers/models/unet_1d.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ..utils import deprecate
-from .unets.unet_1d import UNet1DModel, UNet1DOutput
-
-
-class UNet1DOutput(UNet1DOutput):
- deprecation_message = "Importing `UNet1DOutput` from `diffusers.models.unet_1d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d import UNet1DOutput`, instead."
- deprecate("UNet1DOutput", "0.29", deprecation_message)
-
-
-class UNet1DModel(UNet1DModel):
- deprecation_message = "Importing `UNet1DModel` from `diffusers.models.unet_1d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d import UNet1DModel`, instead."
- deprecate("UNet1DModel", "0.29", deprecation_message)
diff --git a/src/diffusers/models/unet_1d_blocks.py b/src/diffusers/models/unet_1d_blocks.py
deleted file mode 100644
index 6b0f09457d17..000000000000
--- a/src/diffusers/models/unet_1d_blocks.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ..utils import deprecate
-from .unets.unet_1d_blocks import (
- AttnDownBlock1D,
- AttnUpBlock1D,
- DownBlock1D,
- DownBlock1DNoSkip,
- DownResnetBlock1D,
- Downsample1d,
- MidResTemporalBlock1D,
- OutConv1DBlock,
- OutValueFunctionBlock,
- ResConvBlock,
- SelfAttention1d,
- UNetMidBlock1D,
- UpBlock1D,
- UpBlock1DNoSkip,
- UpResnetBlock1D,
- Upsample1d,
- ValueFunctionMidBlock1D,
-)
-
-
-class DownResnetBlock1D(DownResnetBlock1D):
- deprecation_message = "Importing `DownResnetBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import DownResnetBlock1D`, instead."
- deprecate("DownResnetBlock1D", "0.29", deprecation_message)
-
-
-class UpResnetBlock1D(UpResnetBlock1D):
- deprecation_message = "Importing `UpResnetBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UpResnetBlock1D`, instead."
- deprecate("UpResnetBlock1D", "0.29", deprecation_message)
-
-
-class ValueFunctionMidBlock1D(ValueFunctionMidBlock1D):
- deprecation_message = "Importing `ValueFunctionMidBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import ValueFunctionMidBlock1D`, instead."
- deprecate("ValueFunctionMidBlock1D", "0.29", deprecation_message)
-
-
-class OutConv1DBlock(OutConv1DBlock):
- deprecation_message = "Importing `OutConv1DBlock` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import OutConv1DBlock`, instead."
- deprecate("OutConv1DBlock", "0.29", deprecation_message)
-
-
-class OutValueFunctionBlock(OutValueFunctionBlock):
- deprecation_message = "Importing `OutValueFunctionBlock` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import OutValueFunctionBlock`, instead."
- deprecate("OutValueFunctionBlock", "0.29", deprecation_message)
-
-
-class Downsample1d(Downsample1d):
- deprecation_message = "Importing `Downsample1d` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import Downsample1d`, instead."
- deprecate("Downsample1d", "0.29", deprecation_message)
-
-
-class Upsample1d(Upsample1d):
- deprecation_message = "Importing `Upsample1d` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import Upsample1d`, instead."
- deprecate("Upsample1d", "0.29", deprecation_message)
-
-
-class SelfAttention1d(SelfAttention1d):
- deprecation_message = "Importing `SelfAttention1d` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import SelfAttention1d`, instead."
- deprecate("SelfAttention1d", "0.29", deprecation_message)
-
-
-class ResConvBlock(ResConvBlock):
- deprecation_message = "Importing `ResConvBlock` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import ResConvBlock`, instead."
- deprecate("ResConvBlock", "0.29", deprecation_message)
-
-
-class UNetMidBlock1D(UNetMidBlock1D):
- deprecation_message = "Importing `UNetMidBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UNetMidBlock1D`, instead."
- deprecate("UNetMidBlock1D", "0.29", deprecation_message)
-
-
-class AttnDownBlock1D(AttnDownBlock1D):
- deprecation_message = "Importing `AttnDownBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import AttnDownBlock1D`, instead."
- deprecate("AttnDownBlock1D", "0.29", deprecation_message)
-
-
-class DownBlock1D(DownBlock1D):
- deprecation_message = "Importing `DownBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import DownBlock1D`, instead."
- deprecate("DownBlock1D", "0.29", deprecation_message)
-
-
-class DownBlock1DNoSkip(DownBlock1DNoSkip):
- deprecation_message = "Importing `DownBlock1DNoSkip` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import DownBlock1DNoSkip`, instead."
- deprecate("DownBlock1DNoSkip", "0.29", deprecation_message)
-
-
-class AttnUpBlock1D(AttnUpBlock1D):
- deprecation_message = "Importing `AttnUpBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import AttnUpBlock1D`, instead."
- deprecate("AttnUpBlock1D", "0.29", deprecation_message)
-
-
-class UpBlock1D(UpBlock1D):
- deprecation_message = "Importing `UpBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UpBlock1D`, instead."
- deprecate("UpBlock1D", "0.29", deprecation_message)
-
-
-class UpBlock1DNoSkip(UpBlock1DNoSkip):
- deprecation_message = "Importing `UpBlock1DNoSkip` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UpBlock1DNoSkip`, instead."
- deprecate("UpBlock1DNoSkip", "0.29", deprecation_message)
-
-
-class MidResTemporalBlock1D(MidResTemporalBlock1D):
- deprecation_message = "Importing `MidResTemporalBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import MidResTemporalBlock1D`, instead."
- deprecate("MidResTemporalBlock1D", "0.29", deprecation_message)
-
-
-def get_down_block(
- down_block_type: str,
- num_layers: int,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- add_downsample: bool,
-):
- deprecation_message = "Importing `get_down_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_down_block`, instead."
- deprecate("get_down_block", "0.29", deprecation_message)
-
- from .unets.unet_1d_blocks import get_down_block
-
- return get_down_block(
- down_block_type=down_block_type,
- num_layers=num_layers,
- in_channels=in_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- add_downsample=add_downsample,
- )
-
-
-def get_up_block(
- up_block_type: str, num_layers: int, in_channels: int, out_channels: int, temb_channels: int, add_upsample: bool
-):
- deprecation_message = "Importing `get_up_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_up_block`, instead."
- deprecate("get_up_block", "0.29", deprecation_message)
-
- from .unets.unet_1d_blocks import get_up_block
-
- return get_up_block(
- up_block_type=up_block_type,
- num_layers=num_layers,
- in_channels=in_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- add_upsample=add_upsample,
- )
-
-
-def get_mid_block(
- mid_block_type: str,
- num_layers: int,
- in_channels: int,
- mid_channels: int,
- out_channels: int,
- embed_dim: int,
- add_downsample: bool,
-):
- deprecation_message = "Importing `get_mid_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_mid_block`, instead."
- deprecate("get_mid_block", "0.29", deprecation_message)
-
- from .unets.unet_1d_blocks import get_mid_block
-
- return get_mid_block(
- mid_block_type=mid_block_type,
- num_layers=num_layers,
- in_channels=in_channels,
- mid_channels=mid_channels,
- out_channels=out_channels,
- embed_dim=embed_dim,
- add_downsample=add_downsample,
- )
-
-
-def get_out_block(
- *, out_block_type: str, num_groups_out: int, embed_dim: int, out_channels: int, act_fn: str, fc_dim: int
-):
- deprecation_message = "Importing `get_out_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_out_block`, instead."
- deprecate("get_out_block", "0.29", deprecation_message)
-
- from .unets.unet_1d_blocks import get_out_block
-
- return get_out_block(
- out_block_type=out_block_type,
- num_groups_out=num_groups_out,
- embed_dim=embed_dim,
- out_channels=out_channels,
- act_fn=act_fn,
- fc_dim=fc_dim,
- )
diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py
deleted file mode 100644
index 21f1fea68d6c..000000000000
--- a/src/diffusers/models/unet_2d.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from ..utils import deprecate
-from .unets.unet_2d import UNet2DModel, UNet2DOutput
-
-
-class UNet2DOutput(UNet2DOutput):
- deprecation_message = "Importing `UNet2DOutput` from `diffusers.models.unet_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d import UNet2DOutput`, instead."
- deprecate("UNet2DOutput", "0.29", deprecation_message)
-
-
-class UNet2DModel(UNet2DModel):
- deprecation_message = "Importing `UNet2DModel` from `diffusers.models.unet_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d import UNet2DModel`, instead."
- deprecate("UNet2DModel", "0.29", deprecation_message)
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
deleted file mode 100644
index 931fa89a73f9..000000000000
--- a/src/diffusers/models/unet_2d_blocks.py
+++ /dev/null
@@ -1,375 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional
-
-from ..utils import deprecate
-from .unets.unet_2d_blocks import (
- AttnDownBlock2D,
- AttnDownEncoderBlock2D,
- AttnSkipDownBlock2D,
- AttnSkipUpBlock2D,
- AttnUpBlock2D,
- AttnUpDecoderBlock2D,
- AutoencoderTinyBlock,
- CrossAttnDownBlock2D,
- CrossAttnUpBlock2D,
- DownBlock2D,
- KAttentionBlock,
- KCrossAttnDownBlock2D,
- KCrossAttnUpBlock2D,
- KDownBlock2D,
- KUpBlock2D,
- ResnetDownsampleBlock2D,
- ResnetUpsampleBlock2D,
- SimpleCrossAttnDownBlock2D,
- SimpleCrossAttnUpBlock2D,
- SkipDownBlock2D,
- SkipUpBlock2D,
- UNetMidBlock2D,
- UNetMidBlock2DCrossAttn,
- UNetMidBlock2DSimpleCrossAttn,
- UpBlock2D,
- UpDecoderBlock2D,
-)
-
-
-def get_down_block(
- down_block_type: str,
- num_layers: int,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- add_downsample: bool,
- resnet_eps: float,
- resnet_act_fn: str,
- transformer_layers_per_block: int = 1,
- num_attention_heads: Optional[int] = None,
- resnet_groups: Optional[int] = None,
- cross_attention_dim: Optional[int] = None,
- downsample_padding: Optional[int] = None,
- dual_cross_attention: bool = False,
- use_linear_projection: bool = False,
- only_cross_attention: bool = False,
- upcast_attention: bool = False,
- resnet_time_scale_shift: str = "default",
- attention_type: str = "default",
- resnet_skip_time_act: bool = False,
- resnet_out_scale_factor: float = 1.0,
- cross_attention_norm: Optional[str] = None,
- attention_head_dim: Optional[int] = None,
- downsample_type: Optional[str] = None,
- dropout: float = 0.0,
-):
- deprecation_message = "Importing `get_down_block` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import get_down_block`, instead."
- deprecate("get_down_block", "0.29", deprecation_message)
-
- from .unets.unet_2d_blocks import get_down_block
-
- return get_down_block(
- down_block_type=down_block_type,
- num_layers=num_layers,
- in_channels=in_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- add_downsample=add_downsample,
- resnet_eps=resnet_eps,
- resnet_act_fn=resnet_act_fn,
- transformer_layers_per_block=transformer_layers_per_block,
- num_attention_heads=num_attention_heads,
- resnet_groups=resnet_groups,
- cross_attention_dim=cross_attention_dim,
- downsample_padding=downsample_padding,
- dual_cross_attention=dual_cross_attention,
- use_linear_projection=use_linear_projection,
- only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention,
- resnet_time_scale_shift=resnet_time_scale_shift,
- attention_type=attention_type,
- resnet_skip_time_act=resnet_skip_time_act,
- resnet_out_scale_factor=resnet_out_scale_factor,
- cross_attention_norm=cross_attention_norm,
- attention_head_dim=attention_head_dim,
- downsample_type=downsample_type,
- dropout=dropout,
- )
-
-
-def get_mid_block(
- mid_block_type: str,
- temb_channels: int,
- in_channels: int,
- resnet_eps: float,
- resnet_act_fn: str,
- resnet_groups: int,
- output_scale_factor: float = 1.0,
- transformer_layers_per_block: int = 1,
- num_attention_heads: Optional[int] = None,
- cross_attention_dim: Optional[int] = None,
- dual_cross_attention: bool = False,
- use_linear_projection: bool = False,
- mid_block_only_cross_attention: bool = False,
- upcast_attention: bool = False,
- resnet_time_scale_shift: str = "default",
- attention_type: str = "default",
- resnet_skip_time_act: bool = False,
- cross_attention_norm: Optional[str] = None,
- attention_head_dim: Optional[int] = 1,
- dropout: float = 0.0,
-):
- if mid_block_type == "UNetMidBlock2DCrossAttn":
- return UNetMidBlock2DCrossAttn(
- transformer_layers_per_block=transformer_layers_per_block,
- in_channels=in_channels,
- temb_channels=temb_channels,
- dropout=dropout,
- resnet_eps=resnet_eps,
- resnet_act_fn=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- resnet_time_scale_shift=resnet_time_scale_shift,
- cross_attention_dim=cross_attention_dim,
- num_attention_heads=num_attention_heads,
- resnet_groups=resnet_groups,
- dual_cross_attention=dual_cross_attention,
- use_linear_projection=use_linear_projection,
- upcast_attention=upcast_attention,
- attention_type=attention_type,
- )
- elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
- return UNetMidBlock2DSimpleCrossAttn(
- in_channels=in_channels,
- temb_channels=temb_channels,
- dropout=dropout,
- resnet_eps=resnet_eps,
- resnet_act_fn=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- cross_attention_dim=cross_attention_dim,
- attention_head_dim=attention_head_dim,
- resnet_groups=resnet_groups,
- resnet_time_scale_shift=resnet_time_scale_shift,
- skip_time_act=resnet_skip_time_act,
- only_cross_attention=mid_block_only_cross_attention,
- cross_attention_norm=cross_attention_norm,
- )
- elif mid_block_type == "UNetMidBlock2D":
- return UNetMidBlock2D(
- in_channels=in_channels,
- temb_channels=temb_channels,
- dropout=dropout,
- num_layers=0,
- resnet_eps=resnet_eps,
- resnet_act_fn=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- resnet_groups=resnet_groups,
- resnet_time_scale_shift=resnet_time_scale_shift,
- add_attention=False,
- )
- elif mid_block_type is None:
- return None
- else:
- raise ValueError(f"unknown mid_block_type : {mid_block_type}")
-
-
-def get_up_block(
- up_block_type: str,
- num_layers: int,
- in_channels: int,
- out_channels: int,
- prev_output_channel: int,
- temb_channels: int,
- add_upsample: bool,
- resnet_eps: float,
- resnet_act_fn: str,
- resolution_idx: Optional[int] = None,
- transformer_layers_per_block: int = 1,
- num_attention_heads: Optional[int] = None,
- resnet_groups: Optional[int] = None,
- cross_attention_dim: Optional[int] = None,
- dual_cross_attention: bool = False,
- use_linear_projection: bool = False,
- only_cross_attention: bool = False,
- upcast_attention: bool = False,
- resnet_time_scale_shift: str = "default",
- attention_type: str = "default",
- resnet_skip_time_act: bool = False,
- resnet_out_scale_factor: float = 1.0,
- cross_attention_norm: Optional[str] = None,
- attention_head_dim: Optional[int] = None,
- upsample_type: Optional[str] = None,
- dropout: float = 0.0,
-):
- deprecation_message = "Importing `get_up_block` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import get_up_block`, instead."
- deprecate("get_up_block", "0.29", deprecation_message)
-
- from .unets.unet_2d_blocks import get_up_block
-
- return get_up_block(
- up_block_type=up_block_type,
- num_layers=num_layers,
- in_channels=in_channels,
- out_channels=out_channels,
- prev_output_channel=prev_output_channel,
- temb_channels=temb_channels,
- add_upsample=add_upsample,
- resnet_eps=resnet_eps,
- resnet_act_fn=resnet_act_fn,
- resolution_idx=resolution_idx,
- transformer_layers_per_block=transformer_layers_per_block,
- num_attention_heads=num_attention_heads,
- resnet_groups=resnet_groups,
- cross_attention_dim=cross_attention_dim,
- dual_cross_attention=dual_cross_attention,
- use_linear_projection=use_linear_projection,
- only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention,
- resnet_time_scale_shift=resnet_time_scale_shift,
- attention_type=attention_type,
- resnet_skip_time_act=resnet_skip_time_act,
- resnet_out_scale_factor=resnet_out_scale_factor,
- cross_attention_norm=cross_attention_norm,
- attention_head_dim=attention_head_dim,
- upsample_type=upsample_type,
- dropout=dropout,
- )
-
-
-class AutoencoderTinyBlock(AutoencoderTinyBlock):
- deprecation_message = "Importing `AutoencoderTinyBlock` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AutoencoderTinyBlock`, instead."
- deprecate("AutoencoderTinyBlock", "0.29", deprecation_message)
-
-
-class UNetMidBlock2D(UNetMidBlock2D):
- deprecation_message = "Importing `UNetMidBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2D`, instead."
- deprecate("UNetMidBlock2D", "0.29", deprecation_message)
-
-
-class UNetMidBlock2DCrossAttn(UNetMidBlock2DCrossAttn):
- deprecation_message = "Importing `UNetMidBlock2DCrossAttn` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2DCrossAttn`, instead."
- deprecate("UNetMidBlock2DCrossAttn", "0.29", deprecation_message)
-
-
-class UNetMidBlock2DSimpleCrossAttn(UNetMidBlock2DSimpleCrossAttn):
- deprecation_message = "Importing `UNetMidBlock2DSimpleCrossAttn` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2DSimpleCrossAttn`, instead."
- deprecate("UNetMidBlock2DSimpleCrossAttn", "0.29", deprecation_message)
-
-
-class AttnDownBlock2D(AttnDownBlock2D):
- deprecation_message = "Importing `AttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnDownBlock2D`, instead."
- deprecate("AttnDownBlock2D", "0.29", deprecation_message)
-
-
-class CrossAttnDownBlock2D(CrossAttnDownBlock2D):
- deprecation_message = "Importing `AttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D`, instead."
- deprecate("CrossAttnDownBlock2D", "0.29", deprecation_message)
-
-
-class DownBlock2D(DownBlock2D):
- deprecation_message = "Importing `DownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import DownBlock2D`, instead."
- deprecate("DownBlock2D", "0.29", deprecation_message)
-
-
-class AttnDownEncoderBlock2D(AttnDownEncoderBlock2D):
- deprecation_message = "Importing `AttnDownEncoderBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnDownEncoderBlock2D`, instead."
- deprecate("AttnDownEncoderBlock2D", "0.29", deprecation_message)
-
-
-class AttnSkipDownBlock2D(AttnSkipDownBlock2D):
- deprecation_message = "Importing `AttnSkipDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnSkipDownBlock2D`, instead."
- deprecate("AttnSkipDownBlock2D", "0.29", deprecation_message)
-
-
-class SkipDownBlock2D(SkipDownBlock2D):
- deprecation_message = "Importing `SkipDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SkipDownBlock2D`, instead."
- deprecate("SkipDownBlock2D", "0.29", deprecation_message)
-
-
-class ResnetDownsampleBlock2D(ResnetDownsampleBlock2D):
- deprecation_message = "Importing `ResnetDownsampleBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import ResnetDownsampleBlock2D`, instead."
- deprecate("ResnetDownsampleBlock2D", "0.29", deprecation_message)
-
-
-class SimpleCrossAttnDownBlock2D(SimpleCrossAttnDownBlock2D):
- deprecation_message = "Importing `SimpleCrossAttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SimpleCrossAttnDownBlock2D`, instead."
- deprecate("SimpleCrossAttnDownBlock2D", "0.29", deprecation_message)
-
-
-class KDownBlock2D(KDownBlock2D):
- deprecation_message = "Importing `KDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KDownBlock2D`, instead."
- deprecate("KDownBlock2D", "0.29", deprecation_message)
-
-
-class KCrossAttnDownBlock2D(KCrossAttnDownBlock2D):
- deprecation_message = "Importing `KCrossAttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KCrossAttnDownBlock2D`, instead."
- deprecate("KCrossAttnDownBlock2D", "0.29", deprecation_message)
-
-
-class AttnUpBlock2D(AttnUpBlock2D):
- deprecation_message = "Importing `AttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnUpBlock2D`, instead."
- deprecate("AttnUpBlock2D", "0.29", deprecation_message)
-
-
-class CrossAttnUpBlock2D(CrossAttnUpBlock2D):
- deprecation_message = "Importing `CrossAttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import CrossAttnUpBlock2D`, instead."
- deprecate("CrossAttnUpBlock2D", "0.29", deprecation_message)
-
-
-class UpBlock2D(UpBlock2D):
- deprecation_message = "Importing `UpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UpBlock2D`, instead."
- deprecate("UpBlock2D", "0.29", deprecation_message)
-
-
-class UpDecoderBlock2D(UpDecoderBlock2D):
- deprecation_message = "Importing `UpDecoderBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UpDecoderBlock2D`, instead."
- deprecate("UpDecoderBlock2D", "0.29", deprecation_message)
-
-
-class AttnUpDecoderBlock2D(AttnUpDecoderBlock2D):
- deprecation_message = "Importing `AttnUpDecoderBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnUpDecoderBlock2D`, instead."
- deprecate("AttnUpDecoderBlock2D", "0.29", deprecation_message)
-
-
-class AttnSkipUpBlock2D(AttnSkipUpBlock2D):
- deprecation_message = "Importing `AttnSkipUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnSkipUpBlock2D`, instead."
- deprecate("AttnSkipUpBlock2D", "0.29", deprecation_message)
-
-
-class SkipUpBlock2D(SkipUpBlock2D):
- deprecation_message = "Importing `SkipUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SkipUpBlock2D`, instead."
- deprecate("SkipUpBlock2D", "0.29", deprecation_message)
-
-
-class ResnetUpsampleBlock2D(ResnetUpsampleBlock2D):
- deprecation_message = "Importing `ResnetUpsampleBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import ResnetUpsampleBlock2D`, instead."
- deprecate("ResnetUpsampleBlock2D", "0.29", deprecation_message)
-
-
-class SimpleCrossAttnUpBlock2D(SimpleCrossAttnUpBlock2D):
- deprecation_message = "Importing `SimpleCrossAttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SimpleCrossAttnUpBlock2D`, instead."
- deprecate("SimpleCrossAttnUpBlock2D", "0.29", deprecation_message)
-
-
-class KUpBlock2D(KUpBlock2D):
- deprecation_message = "Importing `KUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KUpBlock2D`, instead."
- deprecate("KUpBlock2D", "0.29", deprecation_message)
-
-
-class KCrossAttnUpBlock2D(KCrossAttnUpBlock2D):
- deprecation_message = "Importing `KCrossAttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KCrossAttnUpBlock2D`, instead."
- deprecate("KCrossAttnUpBlock2D", "0.29", deprecation_message)
-
-
-# can potentially later be renamed to `No-feed-forward` attention
-class KAttentionBlock(KAttentionBlock):
- deprecation_message = "Importing `KAttentionBlock` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KAttentionBlock`, instead."
- deprecate("KAttentionBlock", "0.29", deprecation_message)
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
deleted file mode 100644
index 85a3e7b09197..000000000000
--- a/src/diffusers/models/unet_2d_condition.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..utils import deprecate
-from .unets.unet_2d_condition import UNet2DConditionModel, UNet2DConditionOutput
-
-
-class UNet2DConditionOutput(UNet2DConditionOutput):
- deprecation_message = "Importing `UNet2DConditionOutput` from `diffusers.models.unet_2d_condition` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput`, instead."
- deprecate("UNet2DConditionOutput", "0.29", deprecation_message)
-
-
-class UNet2DConditionModel(UNet2DConditionModel):
- deprecation_message = "Importing `UNet2DConditionModel` from `diffusers.models.unet_2d_condition` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel`, instead."
- deprecate("UNet2DConditionModel", "0.29", deprecation_message)
diff --git a/src/diffusers/models/unets/unet_1d.py b/src/diffusers/models/unets/unet_1d.py
index d1538cdc61d9..8efabd98ee7d 100644
--- a/src/diffusers/models/unets/unet_1d.py
+++ b/src/diffusers/models/unets/unet_1d.py
@@ -206,11 +206,11 @@ def forward(
The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`.
timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple.
+ Whether or not to return a [`~models.unets.unet_1d.UNet1DOutput`] instead of a plain tuple.
Returns:
- [`~models.unet_1d.UNet1DOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is
+ [`~models.unets.unet_1d.UNet1DOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is
returned where the first element is the sample tensor.
"""
diff --git a/src/diffusers/models/unets/unet_2d.py b/src/diffusers/models/unets/unet_2d.py
index 0f36afe3f931..5972505f2897 100644
--- a/src/diffusers/models/unets/unet_2d.py
+++ b/src/diffusers/models/unets/unet_2d.py
@@ -257,11 +257,11 @@ def forward(
class_labels (`torch.Tensor`, *optional*, defaults to `None`):
Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple.
+ Whether or not to return a [`~models.unets.unet_2d.UNet2DOutput`] instead of a plain tuple.
Returns:
- [`~models.unet_2d.UNet2DOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
+ [`~models.unets.unet_2d.UNet2DOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
returned where the first element is the sample tensor.
"""
# 0. center input if necessary
diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py
index ad45a43b5023..084b7b64f9ab 100644
--- a/src/diffusers/models/unets/unet_2d_condition.py
+++ b/src/diffusers/models/unets/unet_2d_condition.py
@@ -110,13 +110,13 @@ class UNet2DConditionModel(
The dimension of the cross attention features.
transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
- [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
- [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+ [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
+ [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
- [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
- [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+ [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
+ [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
encoder_hid_dim (`int`, *optional*, defaults to None):
If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
dimension to `cross_attention_dim`.
diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py
index b4879fe9639c..331c8fba444d 100644
--- a/src/diffusers/models/unets/unet_3d_condition.py
+++ b/src/diffusers/models/unets/unet_3d_condition.py
@@ -598,15 +598,15 @@ def forward(
mid_block_additional_residual: (`torch.Tensor`, *optional*):
A tensor that if specified is added to the residual of the middle unet block.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+ Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
tuple.
cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
Returns:
- [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
- a `tuple` is returned where the first element is the sample tensor.
+ [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+ otherwise a `tuple` is returned where the first element is the sample tensor.
"""
# By default samples have to be AT least a multiple of the overall upsampling factor.
# The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py
index dbfb4f80259d..276f1059bf01 100644
--- a/src/diffusers/models/unets/unet_i2vgen_xl.py
+++ b/src/diffusers/models/unets/unet_i2vgen_xl.py
@@ -542,13 +542,13 @@ def forward(
`self.processor` in
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+ Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
tuple.
Returns:
- [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
- a `tuple` is returned where the first element is the sample tensor.
+ [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+ otherwise a `tuple` is returned where the first element is the sample tensor.
"""
batch_size, channels, num_frames, height, width = sample.shape
diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py
index 1b62d16d5d77..b224d9d73317 100644
--- a/src/diffusers/models/unets/unet_motion_model.py
+++ b/src/diffusers/models/unets/unet_motion_model.py
@@ -856,13 +856,13 @@ def forward(
mid_block_additional_residual: (`torch.Tensor`, *optional*):
A tensor that if specified is added to the residual of the middle unet block.
return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+ Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
tuple.
Returns:
- [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
- a `tuple` is returned where the first element is the sample tensor.
+ [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+ otherwise a `tuple` is returned where the first element is the sample tensor.
"""
# By default samples have to be AT least a multiple of the overall upsampling factor.
# The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
diff --git a/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/src/diffusers/models/unets/unet_spatio_temporal_condition.py
index 5613e3618d02..bc3acdbece1a 100644
--- a/src/diffusers/models/unets/unet_spatio_temporal_condition.py
+++ b/src/diffusers/models/unets/unet_spatio_temporal_condition.py
@@ -57,9 +57,9 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
The dimension of the cross attention features.
transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
- [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
- [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
- [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
+ [`~models.unets.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
+ [`~models.unets.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
+ [`~models.unets.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
The number of attention heads.
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py
index cb32b1f40734..71aeb09049bb 100644
--- a/src/diffusers/models/vq_model.py
+++ b/src/diffusers/models/vq_model.py
@@ -11,172 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from dataclasses import dataclass
-from typing import Optional, Tuple, Union
+from ..utils import deprecate
+from .autoencoders.vq_model import VQEncoderOutput, VQModel
-import torch
-import torch.nn as nn
-from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput
-from ..utils.accelerate_utils import apply_forward_hook
-from .autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer
-from .modeling_utils import ModelMixin
+class VQEncoderOutput(VQEncoderOutput):
+ deprecation_message = "Importing `VQEncoderOutput` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQEncoderOutput`, instead."
+ deprecate("VQEncoderOutput", "0.31", deprecation_message)
-@dataclass
-class VQEncoderOutput(BaseOutput):
- """
- Output of VQModel encoding method.
-
- Args:
- latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
- The encoded output sample from the last layer of the model.
- """
-
- latents: torch.Tensor
-
-
-class VQModel(ModelMixin, ConfigMixin):
- r"""
- A VQ-VAE model for decoding latent representations.
-
- This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
- for all models (such as downloading or saving).
-
- Parameters:
- in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
- out_channels (int, *optional*, defaults to 3): Number of channels in the output.
- down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
- Tuple of downsample block types.
- up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
- Tuple of upsample block types.
- block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
- Tuple of block output channels.
- layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block.
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
- latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
- sample_size (`int`, *optional*, defaults to `32`): Sample input size.
- num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
- norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers.
- vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
- scaling_factor (`float`, *optional*, defaults to `0.18215`):
- The component-wise standard deviation of the trained latent space computed using the first batch of the
- training set. This is used to scale the latent space to have unit variance when training the diffusion
- model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
- diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
- / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
- Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
- norm_type (`str`, *optional*, defaults to `"group"`):
- Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
- """
-
- @register_to_config
- def __init__(
- self,
- in_channels: int = 3,
- out_channels: int = 3,
- down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
- up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
- block_out_channels: Tuple[int, ...] = (64,),
- layers_per_block: int = 1,
- act_fn: str = "silu",
- latent_channels: int = 3,
- sample_size: int = 32,
- num_vq_embeddings: int = 256,
- norm_num_groups: int = 32,
- vq_embed_dim: Optional[int] = None,
- scaling_factor: float = 0.18215,
- norm_type: str = "group", # group, spatial
- mid_block_add_attention=True,
- lookup_from_codebook=False,
- force_upcast=False,
- ):
- super().__init__()
-
- # pass init params to Encoder
- self.encoder = Encoder(
- in_channels=in_channels,
- out_channels=latent_channels,
- down_block_types=down_block_types,
- block_out_channels=block_out_channels,
- layers_per_block=layers_per_block,
- act_fn=act_fn,
- norm_num_groups=norm_num_groups,
- double_z=False,
- mid_block_add_attention=mid_block_add_attention,
- )
-
- vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
-
- self.quant_conv = nn.Conv2d(latent_channels, vq_embed_dim, 1)
- self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
- self.post_quant_conv = nn.Conv2d(vq_embed_dim, latent_channels, 1)
-
- # pass init params to Decoder
- self.decoder = Decoder(
- in_channels=latent_channels,
- out_channels=out_channels,
- up_block_types=up_block_types,
- block_out_channels=block_out_channels,
- layers_per_block=layers_per_block,
- act_fn=act_fn,
- norm_num_groups=norm_num_groups,
- norm_type=norm_type,
- mid_block_add_attention=mid_block_add_attention,
- )
-
- @apply_forward_hook
- def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput:
- h = self.encoder(x)
- h = self.quant_conv(h)
-
- if not return_dict:
- return (h,)
-
- return VQEncoderOutput(latents=h)
-
- @apply_forward_hook
- def decode(
- self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None
- ) -> Union[DecoderOutput, torch.Tensor]:
- # also go through quantization layer
- if not force_not_quantize:
- quant, commit_loss, _ = self.quantize(h)
- elif self.config.lookup_from_codebook:
- quant = self.quantize.get_codebook_entry(h, shape)
- commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
- else:
- quant = h
- commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
- quant2 = self.post_quant_conv(quant)
- dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None)
-
- if not return_dict:
- return dec, commit_loss
-
- return DecoderOutput(sample=dec, commit_loss=commit_loss)
-
- def forward(
- self, sample: torch.Tensor, return_dict: bool = True
- ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]:
- r"""
- The [`VQModel`] forward method.
-
- Args:
- sample (`torch.Tensor`): Input sample.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple.
-
- Returns:
- [`~models.vq_model.VQEncoderOutput`] or `tuple`:
- If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple`
- is returned.
- """
-
- h = self.encode(sample).latents
- dec = self.decode(h)
-
- if not return_dict:
- return dec.sample, dec.commit_loss
- return dec
+class VQModel(VQModel):
+ deprecation_message = "Importing `VQModel` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQModel`, instead."
+ deprecate("VQModel", "0.31", deprecation_message)
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index c2dd7ac0d551..36031e546a77 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -24,6 +24,7 @@
"deprecated": [],
"latent_diffusion": [],
"ledits_pp": [],
+ "marigold": [],
"stable_diffusion": [],
"stable_diffusion_xl": [],
}
@@ -185,6 +186,12 @@
"LEditsPPPipelineStableDiffusionXL",
]
)
+ _import_structure["marigold"].extend(
+ [
+ "MarigoldDepthPipeline",
+ "MarigoldNormalsPipeline",
+ ]
+ )
_import_structure["musicldm"] = ["MusicLDMPipeline"]
_import_structure["paint_by_example"] = ["PaintByExamplePipeline"]
_import_structure["pia"] = ["PIAPipeline"]
@@ -448,6 +455,10 @@
LEditsPPPipelineStableDiffusion,
LEditsPPPipelineStableDiffusionXL,
)
+ from .marigold import (
+ MarigoldDepthPipeline,
+ MarigoldNormalsPipeline,
+ )
from .musicldm import MusicLDMPipeline
from .paint_by_example import PaintByExamplePipeline
from .pia import PIAPipeline
diff --git a/src/diffusers/pipelines/marigold/__init__.py b/src/diffusers/pipelines/marigold/__init__.py
new file mode 100644
index 000000000000..b5ae03adfc11
--- /dev/null
+++ b/src/diffusers/pipelines/marigold/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+ DIFFUSERS_SLOW_IMPORT,
+ OptionalDependencyNotAvailable,
+ _LazyModule,
+ get_objects_from_module,
+ is_torch_available,
+ is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+ if not (is_transformers_available() and is_torch_available()):
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ from ...utils import dummy_torch_and_transformers_objects # noqa F403
+
+ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+ _import_structure["marigold_image_processing"] = ["MarigoldImageProcessor"]
+ _import_structure["pipeline_marigold_depth"] = ["MarigoldDepthOutput", "MarigoldDepthPipeline"]
+ _import_structure["pipeline_marigold_normals"] = ["MarigoldNormalsOutput", "MarigoldNormalsPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+ try:
+ if not (is_transformers_available() and is_torch_available()):
+ raise OptionalDependencyNotAvailable()
+
+ except OptionalDependencyNotAvailable:
+ from ...utils.dummy_torch_and_transformers_objects import *
+ else:
+ from .marigold_image_processing import MarigoldImageProcessor
+ from .pipeline_marigold_depth import MarigoldDepthOutput, MarigoldDepthPipeline
+ from .pipeline_marigold_normals import MarigoldNormalsOutput, MarigoldNormalsPipeline
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(
+ __name__,
+ globals()["__file__"],
+ _import_structure,
+ module_spec=__spec__,
+ )
+ for name, value in _dummy_objects.items():
+ setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/marigold/marigold_image_processing.py b/src/diffusers/pipelines/marigold/marigold_image_processing.py
new file mode 100644
index 000000000000..51b9983db6f6
--- /dev/null
+++ b/src/diffusers/pipelines/marigold/marigold_image_processing.py
@@ -0,0 +1,576 @@
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+import torch.nn.functional as F
+from PIL import Image
+
+from ... import ConfigMixin
+from ...configuration_utils import register_to_config
+from ...image_processor import PipelineImageInput
+from ...utils import CONFIG_NAME, logging
+from ...utils.import_utils import is_matplotlib_available
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+class MarigoldImageProcessor(ConfigMixin):
+ config_name = CONFIG_NAME
+
+ @register_to_config
+ def __init__(
+ self,
+ vae_scale_factor: int = 8,
+ do_normalize: bool = True,
+ do_range_check: bool = True,
+ ):
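+        # `register_to_config` stores the decorated arguments on `self.config`; no additional state is kept here.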
+ super().__init__()
+
+ @staticmethod
+ def expand_tensor_or_array(images: Union[torch.Tensor, np.ndarray]) -> Union[torch.Tensor, np.ndarray]:
+ """
+ Expand a tensor or array to a specified number of images.
+ """
+ if isinstance(images, np.ndarray):
+ if images.ndim == 2: # [H,W] -> [1,H,W,1]
+ images = images[None, ..., None]
+ if images.ndim == 3: # [H,W,C] -> [1,H,W,C]
+ images = images[None]
+ elif isinstance(images, torch.Tensor):
+ if images.ndim == 2: # [H,W] -> [1,1,H,W]
+ images = images[None, None]
+ elif images.ndim == 3: # [1,H,W] -> [1,1,H,W]
+ images = images[None]
+ else:
+ raise ValueError(f"Unexpected input type: {type(images)}")
+ return images
+
+ @staticmethod
+ def pt_to_numpy(images: torch.Tensor) -> np.ndarray:
+ """
+ Convert a PyTorch tensor to a NumPy image.
+ """
+ images = images.cpu().permute(0, 2, 3, 1).float().numpy()
+ return images
+
+ @staticmethod
+ def numpy_to_pt(images: np.ndarray) -> torch.Tensor:
+ """
+ Convert a NumPy image to a PyTorch tensor.
+ """
+ if np.issubdtype(images.dtype, np.integer) and not np.issubdtype(images.dtype, np.unsignedinteger):
+ raise ValueError(f"Input image dtype={images.dtype} cannot be a signed integer.")
+ if np.issubdtype(images.dtype, np.complexfloating):
+ raise ValueError(f"Input image dtype={images.dtype} cannot be complex.")
+ if np.issubdtype(images.dtype, bool):
+ raise ValueError(f"Input image dtype={images.dtype} cannot be boolean.")
+
+ images = torch.from_numpy(images.transpose(0, 3, 1, 2))
+ return images
+
+ @staticmethod
+ def resize_antialias(
+ image: torch.Tensor, size: Tuple[int, int], mode: str, is_aa: Optional[bool] = None
+ ) -> torch.Tensor:
+ if not torch.is_tensor(image):
+ raise ValueError(f"Invalid input type={type(image)}.")
+ if not torch.is_floating_point(image):
+ raise ValueError(f"Invalid input dtype={image.dtype}.")
+ if image.dim() != 4:
+ raise ValueError(f"Invalid input dimensions; shape={image.shape}.")
+
+ antialias = is_aa and mode in ("bilinear", "bicubic")
+ image = F.interpolate(image, size, mode=mode, antialias=antialias)
+
+ return image
+
+ @staticmethod
+ def resize_to_max_edge(image: torch.Tensor, max_edge_sz: int, mode: str) -> torch.Tensor:
+ if not torch.is_tensor(image):
+ raise ValueError(f"Invalid input type={type(image)}.")
+ if not torch.is_floating_point(image):
+ raise ValueError(f"Invalid input dtype={image.dtype}.")
+ if image.dim() != 4:
+ raise ValueError(f"Invalid input dimensions; shape={image.shape}.")
+
+ h, w = image.shape[-2:]
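+        # Scale so that the longer edge becomes max_edge_sz while preserving the aspect ratio (integer division).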
+ max_orig = max(h, w)
+ new_h = h * max_edge_sz // max_orig
+ new_w = w * max_edge_sz // max_orig
+
+ if new_h == 0 or new_w == 0:
+ raise ValueError(f"Extreme aspect ratio of the input image: [{w} x {h}]")
+
+ image = MarigoldImageProcessor.resize_antialias(image, (new_h, new_w), mode, is_aa=True)
+
+ return image
+
+ @staticmethod
+ def pad_image(image: torch.Tensor, align: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
+ if not torch.is_tensor(image):
+ raise ValueError(f"Invalid input type={type(image)}.")
+ if not torch.is_floating_point(image):
+ raise ValueError(f"Invalid input dtype={image.dtype}.")
+ if image.dim() != 4:
+ raise ValueError(f"Invalid input dimensions; shape={image.shape}.")
+
+ h, w = image.shape[-2:]
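+        # `-h % align` is the smallest non-negative padding that rounds h up to a multiple of `align` (same for w).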
+ ph, pw = -h % align, -w % align
+
+ image = F.pad(image, (0, pw, 0, ph), mode="replicate")
+
+ return image, (ph, pw)
+
+ @staticmethod
+ def unpad_image(image: torch.Tensor, padding: Tuple[int, int]) -> torch.Tensor:
+ if not torch.is_tensor(image):
+ raise ValueError(f"Invalid input type={type(image)}.")
+ if not torch.is_floating_point(image):
+ raise ValueError(f"Invalid input dtype={image.dtype}.")
+ if image.dim() != 4:
+ raise ValueError(f"Invalid input dimensions; shape={image.shape}.")
+
+ ph, pw = padding
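+        # Negative slice bounds strip the replicated padding; `None` keeps the full extent when no padding was added.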
+ uh = None if ph == 0 else -ph
+ uw = None if pw == 0 else -pw
+
+ image = image[:, :, :uh, :uw]
+
+ return image
+
+ @staticmethod
+ def load_image_canonical(
+ image: Union[torch.Tensor, np.ndarray, Image.Image],
+ device: torch.device = torch.device("cpu"),
+ dtype: torch.dtype = torch.float32,
+ ) -> Tuple[torch.Tensor, int]:
+ if isinstance(image, Image.Image):
+ image = np.array(image)
+
+ image_dtype_max = None
+ if isinstance(image, (np.ndarray, torch.Tensor)):
+ image = MarigoldImageProcessor.expand_tensor_or_array(image)
+ if image.ndim != 4:
+ raise ValueError("Input image is not 2-, 3-, or 4-dimensional.")
+ if isinstance(image, np.ndarray):
+ if np.issubdtype(image.dtype, np.integer) and not np.issubdtype(image.dtype, np.unsignedinteger):
+ raise ValueError(f"Input image dtype={image.dtype} cannot be a signed integer.")
+ if np.issubdtype(image.dtype, np.complexfloating):
+ raise ValueError(f"Input image dtype={image.dtype} cannot be complex.")
+ if np.issubdtype(image.dtype, bool):
+ raise ValueError(f"Input image dtype={image.dtype} cannot be boolean.")
+ if np.issubdtype(image.dtype, np.unsignedinteger):
+ image_dtype_max = np.iinfo(image.dtype).max
+ image = image.astype(np.float32) # because torch does not have unsigned dtypes beyond torch.uint8
+ image = MarigoldImageProcessor.numpy_to_pt(image)
+
+ if torch.is_tensor(image) and not torch.is_floating_point(image) and image_dtype_max is None:
+ if image.dtype != torch.uint8:
+ raise ValueError(f"Image dtype={image.dtype} is not supported.")
+ image_dtype_max = 255
+
+ if not torch.is_tensor(image):
+ raise ValueError(f"Input type unsupported: {type(image)}.")
+
+ if image.shape[1] == 1:
+ image = image.repeat(1, 3, 1, 1) # [N,1,H,W] -> [N,3,H,W]
+ if image.shape[1] != 3:
+ raise ValueError(f"Input image is not 1- or 3-channel: {image.shape}.")
+
+ image = image.to(device=device, dtype=dtype)
+
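+        # Rescale integer inputs (e.g. uint8, uint16) from [0, dtype max] to floating-point [0, 1].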
+ if image_dtype_max is not None:
+ image = image / image_dtype_max
+
+ return image
+
+ @staticmethod
+ def check_image_values_range(image: torch.Tensor) -> None:
+ if not torch.is_tensor(image):
+ raise ValueError(f"Invalid input type={type(image)}.")
+ if not torch.is_floating_point(image):
+ raise ValueError(f"Invalid input dtype={image.dtype}.")
+ if image.min().item() < 0.0 or image.max().item() > 1.0:
+ raise ValueError("Input image data is partially outside of the [0,1] range.")
+
+ def preprocess(
+ self,
+ image: PipelineImageInput,
+ processing_resolution: Optional[int] = None,
+ resample_method_input: str = "bilinear",
+ device: torch.device = torch.device("cpu"),
+ dtype: torch.dtype = torch.float32,
+ ):
+ if isinstance(image, list):
+ images = None
+ for i, img in enumerate(image):
+ img = self.load_image_canonical(img, device, dtype) # [N,3,H,W]
+ if images is None:
+ images = img
+ else:
+ if images.shape[2:] != img.shape[2:]:
+ raise ValueError(
+ f"Input image[{i}] has incompatible dimensions {img.shape[2:]} with the previous images "
+ f"{images.shape[2:]}"
+ )
+ images = torch.cat((images, img), dim=0)
+ image = images
+ del images
+ else:
+ image = self.load_image_canonical(image, device, dtype) # [N,3,H,W]
+
+ original_resolution = image.shape[2:]
+
+ if self.config.do_range_check:
+ self.check_image_values_range(image)
+
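+        # Map pixel values from [0, 1] to [-1, 1], the value range expected by the VAE encoder.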
+ if self.config.do_normalize:
+ image = image * 2.0 - 1.0
+
+ if processing_resolution is not None and processing_resolution > 0:
+ image = self.resize_to_max_edge(image, processing_resolution, resample_method_input) # [N,3,PH,PW]
+
+ image, padding = self.pad_image(image, self.config.vae_scale_factor) # [N,3,PPH,PPW]
+
+ return image, padding, original_resolution
+
+ @staticmethod
+ def colormap(
+ image: Union[np.ndarray, torch.Tensor],
+ cmap: str = "Spectral",
+ bytes: bool = False,
+ _force_method: Optional[str] = None,
+ ) -> Union[np.ndarray, torch.Tensor]:
+ """
+ Converts a monochrome image into an RGB image by applying the specified colormap. This function mimics the
+ behavior of matplotlib.colormaps, but allows the user to use the most discriminative color maps ("Spectral",
+ "binary") without having to install or import matplotlib. For all other cases, the function will attempt to use
+ the native implementation.
+
+ Args:
+ image: 2D tensor of values between 0 and 1, either as np.ndarray or torch.Tensor.
+ cmap: Colormap name.
+ bytes: Whether to return the output as uint8 or floating point image.
+ _force_method:
+ Can be used to specify whether to use the native implementation (`"matplotlib"`), the efficient custom
+ implementation of the select color maps (`"custom"`), or rely on autodetection (`None`, default).
+
+ Returns:
+ An RGB-colorized tensor corresponding to the input image.
+ """
+ if not (torch.is_tensor(image) or isinstance(image, np.ndarray)):
+ raise ValueError("Argument must be a numpy array or torch tensor.")
+ if _force_method not in (None, "matplotlib", "custom"):
+ raise ValueError("_force_method must be either `None`, `'matplotlib'` or `'custom'`.")
+
+ supported_cmaps = {
+ "binary": [
+ (1.0, 1.0, 1.0),
+ (0.0, 0.0, 0.0),
+ ],
+ "Spectral": [ # Taken from matplotlib/_cm.py
+ (0.61960784313725492, 0.003921568627450980, 0.25882352941176473), # 0.0 -> [0]
+ (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
+ (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
+ (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
+ (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
+ (1.0, 1.0, 0.74901960784313726),
+ (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
+ (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
+ (0.4, 0.76078431372549016, 0.6470588235294118),
+ (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
+ (0.36862745098039218, 0.30980392156862746, 0.63529411764705879), # 1.0 -> [K-1]
+ ],
+ }
+
+ def method_matplotlib(image, cmap, bytes=False):
+ if is_matplotlib_available():
+ import matplotlib
+ else:
+ return None
+
+ arg_is_pt, device = torch.is_tensor(image), None
+ if arg_is_pt:
+ image, device = image.cpu().numpy(), image.device
+
+ if cmap not in matplotlib.colormaps:
+ raise ValueError(
+ f"Unexpected color map {cmap}; available options are: {', '.join(list(matplotlib.colormaps.keys()))}"
+ )
+
+ cmap = matplotlib.colormaps[cmap]
+ out = cmap(image, bytes=bytes) # [?,4]
+ out = out[..., :3] # [?,3]
+
+ if arg_is_pt:
+ out = torch.tensor(out, device=device)
+
+ return out
+
+ def method_custom(image, cmap, bytes=False):
+ arg_is_np = isinstance(image, np.ndarray)
+ if arg_is_np:
+ image = torch.tensor(image)
+ if image.dtype == torch.uint8:
+ image = image.float() / 255
+ else:
+ image = image.float()
+
+ is_cmap_reversed = cmap.endswith("_r")
+ if is_cmap_reversed:
+ cmap = cmap[:-2]
+
+ if cmap not in supported_cmaps:
+ raise ValueError(
+ f"Only {list(supported_cmaps.keys())} color maps are available without installing matplotlib."
+ )
+
+ cmap = supported_cmaps[cmap]
+ if is_cmap_reversed:
+ cmap = cmap[::-1]
+ cmap = torch.tensor(cmap, dtype=torch.float, device=image.device) # [K,3]
+ K = cmap.shape[0]
+
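+            # Linearly interpolate between the two nearest colormap entries for every pixel.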
+ pos = image.clamp(min=0, max=1) * (K - 1)
+ left = pos.long()
+ right = (left + 1).clamp(max=K - 1)
+
+ d = (pos - left.float()).unsqueeze(-1)
+ left_colors = cmap[left]
+ right_colors = cmap[right]
+
+ out = (1 - d) * left_colors + d * right_colors
+
+ if bytes:
+ out = (out * 255).to(torch.uint8)
+
+ if arg_is_np:
+ out = out.numpy()
+
+ return out
+
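+        # Fast path: the custom implementation covers the common tensor + "Spectral" case without importing matplotlib.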
+ if _force_method is None and torch.is_tensor(image) and cmap == "Spectral":
+ return method_custom(image, cmap, bytes)
+
+ out = None
+ if _force_method != "custom":
+ out = method_matplotlib(image, cmap, bytes)
+
+ if _force_method == "matplotlib" and out is None:
+ raise ImportError("Make sure to install matplotlib if you want to use a color map other than 'Spectral'.")
+
+ if out is None:
+ out = method_custom(image, cmap, bytes)
+
+ return out
+
+ @staticmethod
+ def visualize_depth(
+ depth: Union[
+ PIL.Image.Image,
+ np.ndarray,
+ torch.Tensor,
+ List[PIL.Image.Image],
+ List[np.ndarray],
+ List[torch.Tensor],
+ ],
+ val_min: float = 0.0,
+ val_max: float = 1.0,
+ color_map: str = "Spectral",
+ ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
+ """
+ Visualizes depth maps, such as predictions of the `MarigoldDepthPipeline`.
+
+ Args:
+ depth (`Union[PIL.Image.Image, np.ndarray, torch.Tensor, List[PIL.Image.Image], List[np.ndarray],
+ List[torch.Tensor]]`): Depth maps.
+ val_min (`float`, *optional*, defaults to `0.0`): Minimum value of the visualized depth range.
+ val_max (`float`, *optional*, defaults to `1.0`): Maximum value of the visualized depth range.
+ color_map (`str`, *optional*, defaults to `"Spectral"`): Color map used to convert a single-channel
+ depth prediction into colored representation.
+
+ Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with depth maps visualization.
+ """
+ if val_max <= val_min:
+ raise ValueError(f"Invalid values range: [{val_min}, {val_max}].")
+
+ def visualize_depth_one(img, idx=None):
+            prefix = "Depth" + (f"[{idx}]" if idx is not None else "")
+ if isinstance(img, PIL.Image.Image):
+ if img.mode != "I;16":
+ raise ValueError(f"{prefix}: invalid PIL mode={img.mode}.")
+ img = np.array(img).astype(np.float32) / (2**16 - 1)
+ if isinstance(img, np.ndarray) or torch.is_tensor(img):
+ if img.ndim != 2:
+ raise ValueError(f"{prefix}: unexpected shape={img.shape}.")
+ if isinstance(img, np.ndarray):
+ img = torch.from_numpy(img)
+ if not torch.is_floating_point(img):
+                    raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
+ else:
+ raise ValueError(f"{prefix}: unexpected type={type(img)}.")
+ if val_min != 0.0 or val_max != 1.0:
+ img = (img - val_min) / (val_max - val_min)
+ img = MarigoldImageProcessor.colormap(img, cmap=color_map, bytes=True) # [H,W,3]
+ img = PIL.Image.fromarray(img.cpu().numpy())
+ return img
+
+ if depth is None or isinstance(depth, list) and any(o is None for o in depth):
+ raise ValueError("Input depth is `None`")
+ if isinstance(depth, (np.ndarray, torch.Tensor)):
+ depth = MarigoldImageProcessor.expand_tensor_or_array(depth)
+ if isinstance(depth, np.ndarray):
+ depth = MarigoldImageProcessor.numpy_to_pt(depth) # [N,H,W,1] -> [N,1,H,W]
+ if not (depth.ndim == 4 and depth.shape[1] == 1): # [N,1,H,W]
+ raise ValueError(f"Unexpected input shape={depth.shape}, expecting [N,1,H,W].")
+ return [visualize_depth_one(img[0], idx) for idx, img in enumerate(depth)]
+ elif isinstance(depth, list):
+ return [visualize_depth_one(img, idx) for idx, img in enumerate(depth)]
+ else:
+ raise ValueError(f"Unexpected input type: {type(depth)}")
+
+ @staticmethod
+ def export_depth_to_16bit_png(
+ depth: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+ val_min: float = 0.0,
+ val_max: float = 1.0,
+ ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
+ def export_depth_to_16bit_png_one(img, idx=None):
+            prefix = "Depth" + (f"[{idx}]" if idx is not None else "")
+ if not isinstance(img, np.ndarray) and not torch.is_tensor(img):
+ raise ValueError(f"{prefix}: unexpected type={type(img)}.")
+ if img.ndim != 2:
+ raise ValueError(f"{prefix}: unexpected shape={img.shape}.")
+ if torch.is_tensor(img):
+ img = img.cpu().numpy()
+ if not np.issubdtype(img.dtype, np.floating):
+                raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
+ if val_min != 0.0 or val_max != 1.0:
+ img = (img - val_min) / (val_max - val_min)
+ img = (img * (2**16 - 1)).astype(np.uint16)
+ img = PIL.Image.fromarray(img, mode="I;16")
+ return img
+
+ if depth is None or isinstance(depth, list) and any(o is None for o in depth):
+ raise ValueError("Input depth is `None`")
+ if isinstance(depth, (np.ndarray, torch.Tensor)):
+ depth = MarigoldImageProcessor.expand_tensor_or_array(depth)
+ if isinstance(depth, np.ndarray):
+ depth = MarigoldImageProcessor.numpy_to_pt(depth) # [N,H,W,1] -> [N,1,H,W]
+ if not (depth.ndim == 4 and depth.shape[1] == 1):
+ raise ValueError(f"Unexpected input shape={depth.shape}, expecting [N,1,H,W].")
+ return [export_depth_to_16bit_png_one(img[0], idx) for idx, img in enumerate(depth)]
+ elif isinstance(depth, list):
+ return [export_depth_to_16bit_png_one(img, idx) for idx, img in enumerate(depth)]
+ else:
+ raise ValueError(f"Unexpected input type: {type(depth)}")
+
+ @staticmethod
+ def visualize_normals(
+ normals: Union[
+ np.ndarray,
+ torch.Tensor,
+ List[np.ndarray],
+ List[torch.Tensor],
+ ],
+ flip_x: bool = False,
+ flip_y: bool = False,
+ flip_z: bool = False,
+ ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
+ """
+ Visualizes surface normals, such as predictions of the `MarigoldNormalsPipeline`.
+
+ Args:
+ normals (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
+ Surface normals.
+ flip_x (`bool`, *optional*, defaults to `False`): Flips the X axis of the normals frame of reference.
+ Default direction is right.
+ flip_y (`bool`, *optional*, defaults to `False`): Flips the Y axis of the normals frame of reference.
+ Default direction is top.
+ flip_z (`bool`, *optional*, defaults to `False`): Flips the Z axis of the normals frame of reference.
+ Default direction is facing the observer.
+
+ Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with surface normals visualization.
+ """
+ flip_vec = None
+ if any((flip_x, flip_y, flip_z)):
+ flip_vec = torch.tensor(
+ [
+ (-1) ** flip_x,
+ (-1) ** flip_y,
+ (-1) ** flip_z,
+ ],
+ dtype=torch.float32,
+ )
+
+ def visualize_normals_one(img, idx=None):
+ img = img.permute(1, 2, 0)
+ if flip_vec is not None:
+ img *= flip_vec.to(img.device)
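+            # Map normal components from [-1, 1] to [0, 1] and then to uint8 RGB for visualization.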
+ img = (img + 1.0) * 0.5
+ img = (img * 255).to(dtype=torch.uint8, device="cpu").numpy()
+ img = PIL.Image.fromarray(img)
+ return img
+
+ if normals is None or isinstance(normals, list) and any(o is None for o in normals):
+ raise ValueError("Input normals is `None`")
+ if isinstance(normals, (np.ndarray, torch.Tensor)):
+ normals = MarigoldImageProcessor.expand_tensor_or_array(normals)
+ if isinstance(normals, np.ndarray):
+ normals = MarigoldImageProcessor.numpy_to_pt(normals) # [N,3,H,W]
+ if not (normals.ndim == 4 and normals.shape[1] == 3):
+ raise ValueError(f"Unexpected input shape={normals.shape}, expecting [N,3,H,W].")
+ return [visualize_normals_one(img, idx) for idx, img in enumerate(normals)]
+ elif isinstance(normals, list):
+ return [visualize_normals_one(img, idx) for idx, img in enumerate(normals)]
+ else:
+ raise ValueError(f"Unexpected input type: {type(normals)}")
+
+ @staticmethod
+ def visualize_uncertainty(
+ uncertainty: Union[
+ np.ndarray,
+ torch.Tensor,
+ List[np.ndarray],
+ List[torch.Tensor],
+ ],
+ saturation_percentile=95,
+ ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
+ """
+ Visualizes dense uncertainties, such as produced by `MarigoldDepthPipeline` or `MarigoldNormalsPipeline`.
+
+ Args:
+ uncertainty (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
+ Uncertainty maps.
+ saturation_percentile (`int`, *optional*, defaults to `95`):
+ Specifies the percentile uncertainty value visualized with maximum intensity.
+
+ Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with uncertainty visualization.
+ """
+
+ def visualize_uncertainty_one(img, idx=None):
+            prefix = "Uncertainty" + (f"[{idx}]" if idx is not None else "")
+            if img.min() < 0:
+                raise ValueError(f"{prefix}: unexpected data range, min={img.min()}.")
+ img = img.squeeze(0).cpu().numpy()
+ saturation_value = np.percentile(img, saturation_percentile)
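+            # Values at or above the chosen percentile saturate to full intensity (255).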
+ img = np.clip(img * 255 / saturation_value, 0, 255)
+ img = img.astype(np.uint8)
+ img = PIL.Image.fromarray(img)
+ return img
+
+ if uncertainty is None or isinstance(uncertainty, list) and any(o is None for o in uncertainty):
+ raise ValueError("Input uncertainty is `None`")
+ if isinstance(uncertainty, (np.ndarray, torch.Tensor)):
+ uncertainty = MarigoldImageProcessor.expand_tensor_or_array(uncertainty)
+ if isinstance(uncertainty, np.ndarray):
+ uncertainty = MarigoldImageProcessor.numpy_to_pt(uncertainty) # [N,1,H,W]
+ if not (uncertainty.ndim == 4 and uncertainty.shape[1] == 1):
+ raise ValueError(f"Unexpected input shape={uncertainty.shape}, expecting [N,1,H,W].")
+ return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
+ elif isinstance(uncertainty, list):
+ return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
+ else:
+ raise ValueError(f"Unexpected input type: {type(uncertainty)}")
diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py b/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
new file mode 100644
index 000000000000..a602ba611ea5
--- /dev/null
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
@@ -0,0 +1,813 @@
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# More information and citation instructions are available on the
+# Marigold project website: https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import PipelineImageInput
+from ...models import (
+ AutoencoderKL,
+ UNet2DConditionModel,
+)
+from ...schedulers import (
+ DDIMScheduler,
+ LCMScheduler,
+)
+from ...utils import (
+ BaseOutput,
+ logging,
+ replace_example_docstring,
+)
+from ...utils.import_utils import is_scipy_available
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .marigold_image_processing import MarigoldImageProcessor
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+Examples:
+```py
+>>> import diffusers
+>>> import torch
+
+>>> pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+... "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
+... ).to("cuda")
+
+>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+>>> depth = pipe(image)
+
+>>> vis = pipe.image_processor.visualize_depth(depth.prediction)
+>>> vis[0].save("einstein_depth.png")
+
+>>> depth_16bit = pipe.image_processor.export_depth_to_16bit_png(depth.prediction)
+>>> depth_16bit[0].save("einstein_depth_16bit.png")
+```
+"""
+
+
+@dataclass
+class MarigoldDepthOutput(BaseOutput):
+ """
+ Output class for Marigold monocular depth prediction pipeline.
+
+ Args:
+ prediction (`np.ndarray`, `torch.Tensor`):
+ Predicted depth maps with values in the range [0, 1]. The shape is always $numimages \times 1 \times height
+ \times width$, regardless of whether the images were passed as a 4D array or a list.
+ uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
+ Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
+ \times 1 \times height \times width$.
+ latent (`None`, `torch.Tensor`):
+ Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
+ The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
+ """
+
+ prediction: Union[np.ndarray, torch.Tensor]
+ uncertainty: Union[None, np.ndarray, torch.Tensor]
+ latent: Union[None, torch.Tensor]
+
+
+class MarigoldDepthPipeline(DiffusionPipeline):
+ """
+ Pipeline for monocular depth estimation using the Marigold method: https://marigoldmonodepth.github.io.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ unet (`UNet2DConditionModel`):
+ Conditional U-Net to denoise the depth latent, conditioned on image latent.
+ vae (`AutoencoderKL`):
+ Variational Auto-Encoder (VAE) Model to encode and decode images and predictions to and from latent
+ representations.
+ scheduler (`DDIMScheduler` or `LCMScheduler`):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+ text_encoder (`CLIPTextModel`):
+ Text-encoder, for empty text embedding.
+ tokenizer (`CLIPTokenizer`):
+ CLIP tokenizer.
+ prediction_type (`str`, *optional*):
+ Type of predictions made by the model.
+ scale_invariant (`bool`, *optional*):
+ A model property specifying whether the predicted depth maps are scale-invariant. This value must be set in
+ the model config. When used together with the `shift_invariant=True` flag, the model is also called
+ "affine-invariant". NB: overriding this value is not supported.
+ shift_invariant (`bool`, *optional*):
+ A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
+ the model config. When used together with the `scale_invariant=True` flag, the model is also called
+ "affine-invariant". NB: overriding this value is not supported.
+ default_denoising_steps (`int`, *optional*):
+ The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
+ quality with the given model. This value must be set in the model config. When the pipeline is called
+ without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
+ reasonable results with various model flavors compatible with the pipeline, such as those relying on very
+ short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
+ default_processing_resolution (`int`, *optional*):
+ The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
+ the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
+ default value is used. This is required to ensure reasonable results with various model flavors trained
+ with varying optimal processing resolution values.
+ """
+
+ model_cpu_offload_seq = "text_encoder->unet->vae"
+ supported_prediction_types = ("depth", "disparity")
+
+ def __init__(
+ self,
+ unet: UNet2DConditionModel,
+ vae: AutoencoderKL,
+ scheduler: Union[DDIMScheduler, LCMScheduler],
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ prediction_type: Optional[str] = None,
+ scale_invariant: Optional[bool] = True,
+ shift_invariant: Optional[bool] = True,
+ default_denoising_steps: Optional[int] = None,
+ default_processing_resolution: Optional[int] = None,
+ ):
+ super().__init__()
+
+ if prediction_type not in self.supported_prediction_types:
+ logger.warning(
+ f"Potentially unsupported `prediction_type='{prediction_type}'`; values supported by the pipeline: "
+ f"{self.supported_prediction_types}."
+ )
+
+ self.register_modules(
+ unet=unet,
+ vae=vae,
+ scheduler=scheduler,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ )
+ self.register_to_config(
+ prediction_type=prediction_type,
+ scale_invariant=scale_invariant,
+ shift_invariant=shift_invariant,
+ default_denoising_steps=default_denoising_steps,
+ default_processing_resolution=default_processing_resolution,
+ )
+
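+        # Spatial downsampling factor of the VAE (8 for the standard Stable Diffusion VAE with four resolution levels).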
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+ self.scale_invariant = scale_invariant
+ self.shift_invariant = shift_invariant
+ self.default_denoising_steps = default_denoising_steps
+ self.default_processing_resolution = default_processing_resolution
+
+ self.empty_text_embedding = None
+
+ self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+ def check_inputs(
+ self,
+ image: PipelineImageInput,
+ num_inference_steps: int,
+ ensemble_size: int,
+ processing_resolution: int,
+ resample_method_input: str,
+ resample_method_output: str,
+ batch_size: int,
+ ensembling_kwargs: Optional[Dict[str, Any]],
+ latents: Optional[torch.Tensor],
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+ output_type: str,
+ output_uncertainty: bool,
+ ) -> int:
+ if num_inference_steps is None:
+ raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
+ if num_inference_steps < 1:
+ raise ValueError("`num_inference_steps` must be positive.")
+ if ensemble_size < 1:
+ raise ValueError("`ensemble_size` must be positive.")
+ if ensemble_size == 2:
+ logger.warning(
+ "`ensemble_size` == 2 results are similar to no ensembling (1); "
+ "consider increasing the value to at least 3."
+ )
+ if ensemble_size > 1 and (self.scale_invariant or self.shift_invariant) and not is_scipy_available():
+ raise ImportError("Make sure to install scipy if you want to use ensembling.")
+ if ensemble_size == 1 and output_uncertainty:
+ raise ValueError(
+ "Computing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` "
+ "greater than 1."
+ )
+ if processing_resolution is None:
+ raise ValueError(
+ "`processing_resolution` is not specified and could not be resolved from the model config."
+ )
+ if processing_resolution < 0:
+ raise ValueError(
+ "`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
+ "downsampled processing."
+ )
+ if processing_resolution % self.vae_scale_factor != 0:
+ raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
+ if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+ raise ValueError(
+                "`resample_method_input` takes string values supported by `torch.nn.functional.interpolate`: "
+                "nearest, nearest-exact, bilinear, bicubic, area."
+ )
+ if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+ raise ValueError(
+                "`resample_method_output` takes string values supported by `torch.nn.functional.interpolate`: "
+                "nearest, nearest-exact, bilinear, bicubic, area."
+ )
+ if batch_size < 1:
+ raise ValueError("`batch_size` must be positive.")
+ if output_type not in ["pt", "np"]:
+ raise ValueError("`output_type` must be one of `pt` or `np`.")
+ if latents is not None and generator is not None:
+ raise ValueError("`latents` and `generator` cannot be used together.")
+ if ensembling_kwargs is not None:
+ if not isinstance(ensembling_kwargs, dict):
+ raise ValueError("`ensembling_kwargs` must be a dictionary.")
+ if "reduction" in ensembling_kwargs and ensembling_kwargs["reduction"] not in ("mean", "median"):
+ raise ValueError("`ensembling_kwargs['reduction']` can be either `'mean'` or `'median'`.")
+
+ # image checks
+ num_images = 0
+ W, H = None, None
+ if not isinstance(image, list):
+ image = [image]
+ for i, img in enumerate(image):
+ if isinstance(img, np.ndarray) or torch.is_tensor(img):
+ if img.ndim not in (2, 3, 4):
+ raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
+ H_i, W_i = img.shape[-2:]
+ N_i = 1
+ if img.ndim == 4:
+ N_i = img.shape[0]
+ elif isinstance(img, Image.Image):
+ W_i, H_i = img.size
+ N_i = 1
+ else:
+ raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
+ if W is None:
+ W, H = W_i, H_i
+ elif (W, H) != (W_i, H_i):
+ raise ValueError(
+ f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
+ )
+ num_images += N_i
+
+ # latents checks
+ if latents is not None:
+ if not torch.is_tensor(latents):
+ raise ValueError("`latents` must be a torch.Tensor.")
+ if latents.dim() != 4:
+ raise ValueError(f"`latents` has unsupported dimensions or shape: {latents.shape}.")
+
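+ # Derive the expected latent shape: the longer image side is resized to
+ # `processing_resolution`, and the resulting spatial dimensions are ceil-divided
+ # by the VAE scale factor (the actual padding happens later in preprocessing).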
+ if processing_resolution > 0:
+ max_orig = max(H, W)
+ new_H = H * processing_resolution // max_orig
+ new_W = W * processing_resolution // max_orig
+ if new_H == 0 or new_W == 0:
+ raise ValueError(f"Extreme aspect ratio of the input image: [{W} x {H}]")
+ W, H = new_W, new_H
+ w = (W + self.vae_scale_factor - 1) // self.vae_scale_factor
+ h = (H + self.vae_scale_factor - 1) // self.vae_scale_factor
+ shape_expected = (num_images * ensemble_size, self.vae.config.latent_channels, h, w)
+
+ if latents.shape != shape_expected:
+ raise ValueError(f"`latents` has unexpected shape={latents.shape} expected={shape_expected}.")
+
+ # generator checks
+ if generator is not None:
+ if isinstance(generator, list):
+ if len(generator) != num_images * ensemble_size:
+ raise ValueError(
+ "The number of generators must match the total number of ensemble members for all input images."
+ )
+ if not all(g.device.type == generator[0].device.type for g in generator):
+ raise ValueError("`generator` device placement is not consistent in the list.")
+ elif not isinstance(generator, torch.Generator):
+ raise ValueError(f"Unsupported generator type: {type(generator)}.")
+
+ return num_images
+
+ def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
+ if not hasattr(self, "_progress_bar_config"):
+ self._progress_bar_config = {}
+ elif not isinstance(self._progress_bar_config, dict):
+ raise ValueError(
+ f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
+ )
+
+ progress_bar_config = dict(**self._progress_bar_config)
+ progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
+ progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
+ if iterable is not None:
+ return tqdm(iterable, **progress_bar_config)
+ elif total is not None:
+ return tqdm(total=total, **progress_bar_config)
+ else:
+ raise ValueError("Either `total` or `iterable` has to be defined.")
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ image: PipelineImageInput,
+ num_inference_steps: Optional[int] = None,
+ ensemble_size: int = 1,
+ processing_resolution: Optional[int] = None,
+ match_input_resolution: bool = True,
+ resample_method_input: str = "bilinear",
+ resample_method_output: str = "bilinear",
+ batch_size: int = 1,
+ ensembling_kwargs: Optional[Dict[str, Any]] = None,
+ latents: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ output_type: str = "np",
+ output_uncertainty: bool = False,
+ output_latent: bool = False,
+ return_dict: bool = True,
+ ):
+ """
+ Function invoked when calling the pipeline.
+
+ Args:
+ image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
+ `List[torch.Tensor]`): An input image or images used as an input for the depth estimation task. For
+ arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is possible
+ by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
+ three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
+ same width and height.
+ num_inference_steps (`int`, *optional*, defaults to `None`):
+ Number of denoising diffusion steps during inference. The default value `None` results in automatic
+ selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
+ for Marigold-LCM models.
+ ensemble_size (`int`, defaults to `1`):
+ Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
+ faster inference.
+ processing_resolution (`int`, *optional*, defaults to `None`):
+ Effective processing resolution. When set to `0`, matches the larger input image dimension. This
+ produces crisper predictions, but may also lead to the overall loss of global context. The default
+ value `None` resolves to the optimal value from the model config.
+ match_input_resolution (`bool`, *optional*, defaults to `True`):
+ When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
+ side of the output will equal `processing_resolution`.
+ resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
+ Resampling method used to resize input images to `processing_resolution`. The accepted values are:
+ `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+ resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
+ Resampling method used to resize output predictions to match the input resolution. The accepted values
+ are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+ batch_size (`int`, *optional*, defaults to `1`):
+ Batch size; only matters when setting `ensemble_size` or passing a tensor of images.
+ ensembling_kwargs (`dict`, *optional*, defaults to `None`):
+ Extra dictionary with arguments for precise ensembling control. The following options are available:
+ - reduction (`str`, *optional*, defaults to `"median"`): Defines the ensembling function applied in
+ every pixel location, can be either `"median"` or `"mean"`.
+ - regularizer_strength (`float`, *optional*, defaults to `0.02`): Strength of the regularizer that
+ pulls the aligned predictions to the unit range from 0 to 1.
+ - max_iter (`int`, *optional*, defaults to `2`): Maximum number of the alignment solver steps. Refer to
+ `scipy.optimize.minimize` function, `options` argument.
+ - tol (`float`, *optional*, defaults to `1e-3`): Alignment solver tolerance. The solver stops when the
+ tolerance is reached.
+ - max_res (`int`, *optional*, defaults to `1024`): Resolution at which the alignment is performed;
+ `None` matches the `processing_resolution`.
+ latents (`torch.Tensor`, or `List[torch.Tensor]`, *optional*, defaults to `None`):
+ Latent noise tensors to replace the random initialization. These can be taken from the previous
+ function call's output.
+ generator (`torch.Generator`, or `List[torch.Generator]`, *optional*, defaults to `None`):
+ Random number generator object to ensure reproducibility.
+ output_type (`str`, *optional*, defaults to `"np"`):
+ Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
+ values are: `"np"` (numpy array) or `"pt"` (torch tensor).
+ output_uncertainty (`bool`, *optional*, defaults to `False`):
+ When enabled, the output's `uncertainty` field contains the predictive uncertainty map, provided that
+ the `ensemble_size` argument is set to a value greater than 1.
+ output_latent (`bool`, *optional*, defaults to `False`):
+ When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
+ within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
+ `latents` argument.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.marigold.MarigoldDepthOutput`] instead of a plain tuple.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.marigold.MarigoldDepthOutput`] or `tuple`:
+ If `return_dict` is `True`, [`~pipelines.marigold.MarigoldDepthOutput`] is returned, otherwise a
+ `tuple` is returned where the first element is the prediction, the second element is the uncertainty
+ (or `None`), and the third is the latent (or `None`).
+ """
+
+ # 0. Resolving variables.
+ device = self._execution_device
+ dtype = self.dtype
+
+ # Model-specific optimal default values leading to fast and reasonable results.
+ if num_inference_steps is None:
+ num_inference_steps = self.default_denoising_steps
+ if processing_resolution is None:
+ processing_resolution = self.default_processing_resolution
+
+ # 1. Check inputs.
+ num_images = self.check_inputs(
+ image,
+ num_inference_steps,
+ ensemble_size,
+ processing_resolution,
+ resample_method_input,
+ resample_method_output,
+ batch_size,
+ ensembling_kwargs,
+ latents,
+ generator,
+ output_type,
+ output_uncertainty,
+ )
+
+ # 2. Prepare empty text conditioning.
+ # Model invocation: self.tokenizer, self.text_encoder.
+ if self.empty_text_embedding is None:
+ prompt = ""
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="do_not_pad",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids.to(device)
+ self.empty_text_embedding = self.text_encoder(text_input_ids)[0] # [1,2,1024]
+
+ # 3. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
+ # optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
+ # `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
+ # divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
+ # of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
+ # operation and leads to the most reasonable results. Using the native image resolution or any other processing
+ # resolution can lead to loss of either fine details or global context in the output predictions.
+ image, padding, original_resolution = self.image_processor.preprocess(
+ image, processing_resolution, resample_method_input, device, dtype
+ ) # [N,3,PPH,PPW]
+
+ # 4. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
+ # ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
+ # The latents of each such prediction across all input images and all ensemble members are represented in the
+ # `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
+ # into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
+ # reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
+ # code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
+ # as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
+ # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
+ # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
+ # Model invocation: self.vae.encoder.
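+ # Hypothetical usage sketch of the latent reuse described above (not part of the pipeline; names
+ # and mixing weights are illustrative only):
+ # prev = pipe(frame_0, output_latent=True)
+ # mixed = 0.8 * prev.latent + 0.2 * torch.randn_like(prev.latent)
+ # out = pipe(frame_1, latents=mixed, output_latent=True)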
+ image_latent, pred_latent = self.prepare_latents(
+ image, latents, generator, ensemble_size, batch_size
+ ) # [N*E,4,h,w], [N*E,4,h,w]
+
+ del image
+
+ batch_empty_text_embedding = self.empty_text_embedding.to(device=device, dtype=dtype).repeat(
+ batch_size, 1, 1
+ ) # [B,2,1024]
+
+ # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
+ # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
+ # outputs noise for the predicted modality's latent space. The number of denoising diffusion steps is defined by
+ # `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
+ # model.
+ # Model invocation: self.unet.
+ pred_latents = []
+
+ for i in self.progress_bar(
+ range(0, num_images * ensemble_size, batch_size), leave=True, desc="Marigold predictions..."
+ ):
+ batch_image_latent = image_latent[i : i + batch_size] # [B,4,h,w]
+ batch_pred_latent = pred_latent[i : i + batch_size] # [B,4,h,w]
+ effective_batch_size = batch_image_latent.shape[0]
+ text = batch_empty_text_embedding[:effective_batch_size] # [B,2,1024]
+
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
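+ # The scheduler is re-initialized for every batch, so each batch of ensemble members
+ # runs through the complete denoising schedule independently.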
+ for t in self.progress_bar(self.scheduler.timesteps, leave=False, desc="Diffusion steps..."):
+ batch_latent = torch.cat([batch_image_latent, batch_pred_latent], dim=1) # [B,8,h,w]
+ noise = self.unet(batch_latent, t, encoder_hidden_states=text, return_dict=False)[0] # [B,4,h,w]
+ batch_pred_latent = self.scheduler.step(
+ noise, t, batch_pred_latent, generator=generator
+ ).prev_sample # [B,4,h,w]
+
+ pred_latents.append(batch_pred_latent)
+
+ pred_latent = torch.cat(pred_latents, dim=0) # [N*E,4,h,w]
+
+ del (
+ pred_latents,
+ image_latent,
+ batch_empty_text_embedding,
+ batch_image_latent,
+ batch_pred_latent,
+ text,
+ batch_latent,
+ noise,
+ )
+
+ # 6. Decode predictions from latent into pixel space. The resulting `N * E` predictions have shape `(PPH, PPW)`,
+ # which requires slight postprocessing. Decoding into pixel space happens in batches of size `batch_size`.
+ # Model invocation: self.vae.decoder.
+ prediction = torch.cat(
+ [
+ self.decode_prediction(pred_latent[i : i + batch_size])
+ for i in range(0, pred_latent.shape[0], batch_size)
+ ],
+ dim=0,
+ ) # [N*E,1,PPH,PPW]
+
+ if not output_latent:
+ pred_latent = None
+
+ # 7. Remove padding. The output shape is (PH, PW).
+ prediction = self.image_processor.unpad_image(prediction, padding) # [N*E,1,PH,PW]
+
+ # 8. Ensemble and compute uncertainty (when `output_uncertainty` is set). This code treats each of the `N`
+ # groups of `E` ensemble predictions independently. For each group it computes an ensembled prediction of shape
+ # `(PH, PW)` and an optional uncertainty map of the same dimensions. After computing this pair of outputs for
+ # each group independently, it stacks them respectively into batches of `N` almost final predictions and
+ # uncertainty maps.
+ uncertainty = None
+ if ensemble_size > 1:
+ prediction = prediction.reshape(num_images, ensemble_size, *prediction.shape[1:]) # [N,E,1,PH,PW]
+ prediction = [
+ self.ensemble_depth(
+ prediction[i],
+ self.scale_invariant,
+ self.shift_invariant,
+ output_uncertainty,
+ **(ensembling_kwargs or {}),
+ )
+ for i in range(num_images)
+ ] # [ [[1,1,PH,PW], [1,1,PH,PW]], ... ]
+ prediction, uncertainty = zip(*prediction) # [[1,1,PH,PW], ... ], [[1,1,PH,PW], ... ]
+ prediction = torch.cat(prediction, dim=0) # [N,1,PH,PW]
+ if output_uncertainty:
+ uncertainty = torch.cat(uncertainty, dim=0) # [N,1,PH,PW]
+ else:
+ uncertainty = None
+
+ # 9. If `match_input_resolution` is set, the output prediction and the uncertainty are upsampled to match the
+ # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
+ # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
+ # setting the `resample_method_output` parameter (e.g., to `"nearest"`).
+ if match_input_resolution:
+ prediction = self.image_processor.resize_antialias(
+ prediction, original_resolution, resample_method_output, is_aa=False
+ ) # [N,1,H,W]
+ if uncertainty is not None and output_uncertainty:
+ uncertainty = self.image_processor.resize_antialias(
+ uncertainty, original_resolution, resample_method_output, is_aa=False
+ ) # [N,1,H,W]
+
+ # 10. Prepare the final outputs.
+ if output_type == "np":
+ prediction = self.image_processor.pt_to_numpy(prediction) # [N,H,W,1]
+ if uncertainty is not None and output_uncertainty:
+ uncertainty = self.image_processor.pt_to_numpy(uncertainty) # [N,H,W,1]
+
+ # 11. Offload all models
+ self.maybe_free_model_hooks()
+
+ if not return_dict:
+ return (prediction, uncertainty, pred_latent)
+
+ return MarigoldDepthOutput(
+ prediction=prediction,
+ uncertainty=uncertainty,
+ latent=pred_latent,
+ )
+
+ def prepare_latents(
+ self,
+ image: torch.Tensor,
+ latents: Optional[torch.Tensor],
+ generator: Optional[torch.Generator],
+ ensemble_size: int,
+ batch_size: int,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ def retrieve_latents(encoder_output):
+ if hasattr(encoder_output, "latent_dist"):
+ return encoder_output.latent_dist.mode()
+ elif hasattr(encoder_output, "latents"):
+ return encoder_output.latents
+ else:
+ raise AttributeError("Could not access latents of provided encoder_output")
+
+ image_latent = torch.cat(
+ [
+ retrieve_latents(self.vae.encode(image[i : i + batch_size]))
+ for i in range(0, image.shape[0], batch_size)
+ ],
+ dim=0,
+ ) # [N,4,h,w]
+ image_latent = image_latent * self.vae.config.scaling_factor
+ image_latent = image_latent.repeat_interleave(ensemble_size, dim=0) # [N*E,4,h,w]
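+ # `repeat_interleave` keeps the E ensemble copies of each image adjacent: batch index i
+ # corresponds to input image i // E and ensemble member i % E.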
+
+ pred_latent = latents
+ if pred_latent is None:
+ pred_latent = randn_tensor(
+ image_latent.shape,
+ generator=generator,
+ device=image_latent.device,
+ dtype=image_latent.dtype,
+ ) # [N*E,4,h,w]
+
+ return image_latent, pred_latent
+
+ def decode_prediction(self, pred_latent: torch.Tensor) -> torch.Tensor:
+ if pred_latent.dim() != 4 or pred_latent.shape[1] != self.vae.config.latent_channels:
+ raise ValueError(
+ f"Expecting 4D tensor of shape [B,{self.vae.config.latent_channels},H,W]; got {pred_latent.shape}."
+ )
+
+ prediction = self.vae.decode(pred_latent / self.vae.config.scaling_factor, return_dict=False)[0] # [B,3,H,W]
+
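+ # The decoded depth is replicated across the three RGB channels; averaging collapses it
+ # into a single channel, and the affine step below remaps the range [-1, 1] to [0, 1].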
+ prediction = prediction.mean(dim=1, keepdim=True) # [B,1,H,W]
+ prediction = torch.clip(prediction, -1.0, 1.0) # [B,1,H,W]
+ prediction = (prediction + 1.0) / 2.0
+
+ return prediction # [B,1,H,W]
+
+ @staticmethod
+ def ensemble_depth(
+ depth: torch.Tensor,
+ scale_invariant: bool = True,
+ shift_invariant: bool = True,
+ output_uncertainty: bool = False,
+ reduction: str = "median",
+ regularizer_strength: float = 0.02,
+ max_iter: int = 2,
+ tol: float = 1e-3,
+ max_res: int = 1024,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ """
+ Ensembles the depth maps represented by the `depth` tensor with expected shape `(B, 1, H, W)`, where B is the
+ number of ensemble members for a given prediction of size `(H x W)`. Even though the function is designed for
+ depth maps, it can also be used with disparity maps as long as the input tensor values are non-negative. The
+ alignment happens when the predictions have one or more degrees of freedom, that is when they are either
+ affine-invariant (`scale_invariant=True` and `shift_invariant=True`), or just scale-invariant (only
+ `scale_invariant=True`). For absolute predictions (`scale_invariant=False` and `shift_invariant=False`)
+ alignment is skipped and only ensembling is performed.
+
+ Args:
+ depth (`torch.Tensor`):
+ Input ensemble depth maps.
+ scale_invariant (`bool`, *optional*, defaults to `True`):
+ Whether to treat predictions as scale-invariant.
+ shift_invariant (`bool`, *optional*, defaults to `True`):
+ Whether to treat predictions as shift-invariant.
+ output_uncertainty (`bool`, *optional*, defaults to `False`):
+ Whether to output uncertainty map.
+ reduction (`str`, *optional*, defaults to `"median"`):
+ Reduction method used to ensemble aligned predictions. The accepted values are: `"mean"` and
+ `"median"`.
+ regularizer_strength (`float`, *optional*, defaults to `0.02`):
+ Strength of the regularizer that pulls the aligned predictions to the unit range from 0 to 1.
+ max_iter (`int`, *optional*, defaults to `2`):
+ Maximum number of the alignment solver steps. Refer to `scipy.optimize.minimize` function, `options`
+ argument.
+ tol (`float`, *optional*, defaults to `1e-3`):
+ Alignment solver tolerance. The solver stops when the tolerance is reached.
+ max_res (`int`, *optional*, defaults to `1024`):
+ Resolution at which the alignment is performed; `None` matches the `processing_resolution`.
+ Returns:
+ A tensor of aligned and ensembled depth maps and optionally a tensor of uncertainties of the same shape:
+ `(1, 1, H, W)`.
+ """
+ if depth.dim() != 4 or depth.shape[1] != 1:
+ raise ValueError(f"Expecting 4D tensor of shape [B,1,H,W]; got {depth.shape}.")
+ if reduction not in ("mean", "median"):
+ raise ValueError(f"Unrecognized reduction method: {reduction}.")
+ if not scale_invariant and shift_invariant:
+ raise ValueError("Pure shift-invariant ensembling is not supported.")
+
+ def init_param(depth: torch.Tensor):
+ init_min = depth.reshape(ensemble_size, -1).min(dim=1).values
+ init_max = depth.reshape(ensemble_size, -1).max(dim=1).values
+
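+ # Initial guess: a per-member scale (and shift, if applicable) that maps each member's
+ # value range onto [0, 1].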
+ if scale_invariant and shift_invariant:
+ init_s = 1.0 / (init_max - init_min).clamp(min=1e-6)
+ init_t = -init_s * init_min
+ param = torch.cat((init_s, init_t)).cpu().numpy()
+ elif scale_invariant:
+ init_s = 1.0 / init_max.clamp(min=1e-6)
+ param = init_s.cpu().numpy()
+ else:
+ raise ValueError("Unrecognized alignment.")
+
+ return param
+
+ def align(depth: torch.Tensor, param: np.ndarray) -> torch.Tensor:
+ if scale_invariant and shift_invariant:
+ s, t = np.split(param, 2)
+ s = torch.from_numpy(s).to(depth).view(ensemble_size, 1, 1, 1)
+ t = torch.from_numpy(t).to(depth).view(ensemble_size, 1, 1, 1)
+ out = depth * s + t
+ elif scale_invariant:
+ s = torch.from_numpy(param).to(depth).view(ensemble_size, 1, 1, 1)
+ out = depth * s
+ else:
+ raise ValueError("Unrecognized alignment.")
+ return out
+
+ def ensemble(
+ depth_aligned: torch.Tensor, return_uncertainty: bool = False
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ uncertainty = None
+ if reduction == "mean":
+ prediction = torch.mean(depth_aligned, dim=0, keepdim=True)
+ if return_uncertainty:
+ uncertainty = torch.std(depth_aligned, dim=0, keepdim=True)
+ elif reduction == "median":
+ prediction = torch.median(depth_aligned, dim=0, keepdim=True).values
+ if return_uncertainty:
+ uncertainty = torch.median(torch.abs(depth_aligned - prediction), dim=0, keepdim=True).values
+ else:
+ raise ValueError(f"Unrecognized reduction method: {reduction}.")
+ return prediction, uncertainty
+
+ def cost_fn(param: np.ndarray, depth: torch.Tensor) -> float:
+ cost = 0.0
+ depth_aligned = align(depth, param)
+
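+ # Alignment objective: the sum of pairwise RMS differences between aligned members,
+ # optionally regularized towards the unit range [0, 1].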
+ for i, j in torch.combinations(torch.arange(ensemble_size)):
+ diff = depth_aligned[i] - depth_aligned[j]
+ cost += (diff**2).mean().sqrt().item()
+
+ if regularizer_strength > 0:
+ prediction, _ = ensemble(depth_aligned, return_uncertainty=False)
+ err_near = (0.0 - prediction.min()).abs().item()
+ err_far = (1.0 - prediction.max()).abs().item()
+ cost += (err_near + err_far) * regularizer_strength
+
+ return cost
+
+ def compute_param(depth: torch.Tensor):
+ import scipy
+
+ depth_to_align = depth.to(torch.float32)
+ if max_res is not None and max(depth_to_align.shape[2:]) > max_res:
+ depth_to_align = MarigoldImageProcessor.resize_to_max_edge(depth_to_align, max_res, "nearest-exact")
+
+ param = init_param(depth_to_align)
+
+ res = scipy.optimize.minimize(
+ partial(cost_fn, depth=depth_to_align),
+ param,
+ method="BFGS",
+ tol=tol,
+ options={"maxiter": max_iter, "disp": False},
+ )
+
+ return res.x
+
+ requires_aligning = scale_invariant or shift_invariant
+ ensemble_size = depth.shape[0]
+
+ if requires_aligning:
+ param = compute_param(depth)
+ depth = align(depth, param)
+
+ depth, uncertainty = ensemble(depth, return_uncertainty=output_uncertainty)
+
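+ # Renormalize the ensembled prediction into [0, 1]; for scale-invariant-only predictions
+ # the minimum stays at 0 because shift is not a free parameter.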
+ depth_max = depth.max()
+ if scale_invariant and shift_invariant:
+ depth_min = depth.min()
+ elif scale_invariant:
+ depth_min = 0
+ else:
+ raise ValueError("Unrecognized alignment.")
+ depth_range = (depth_max - depth_min).clamp(min=1e-6)
+ depth = (depth - depth_min) / depth_range
+ if output_uncertainty:
+ uncertainty /= depth_range
+
+ return depth, uncertainty # [1,1,H,W], [1,1,H,W]
diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py b/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
new file mode 100644
index 000000000000..aa9ad36ffc35
--- /dev/null
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
@@ -0,0 +1,690 @@
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# More information and citation instructions are available on the
+# Marigold project website: https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import PipelineImageInput
+from ...models import (
+ AutoencoderKL,
+ UNet2DConditionModel,
+)
+from ...schedulers import (
+ DDIMScheduler,
+ LCMScheduler,
+)
+from ...utils import (
+ BaseOutput,
+ logging,
+ replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .marigold_image_processing import MarigoldImageProcessor
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+Examples:
+```py
+>>> import diffusers
+>>> import torch
+
+>>> pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
+... "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
+... ).to("cuda")
+
+>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+>>> normals = pipe(image)
+
+>>> vis = pipe.image_processor.visualize_normals(normals.prediction)
+>>> vis[0].save("einstein_normals.png")
+```
+"""
+
+
+@dataclass
+class MarigoldNormalsOutput(BaseOutput):
+ """
+ Output class for Marigold monocular normals prediction pipeline.
+
+ Args:
+ prediction (`np.ndarray`, `torch.Tensor`):
+ Predicted normals with values in the range [-1, 1]. The shape is always $numimages \times 3 \times height
+ \times width$, regardless of whether the images were passed as a 4D array or a list.
+ uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
+ Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
+ \times 1 \times height \times width$.
+ latent (`None`, `torch.Tensor`):
+ Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
+ The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
+ """
+
+ prediction: Union[np.ndarray, torch.Tensor]
+ uncertainty: Union[None, np.ndarray, torch.Tensor]
+ latent: Union[None, torch.Tensor]
+
+
+class MarigoldNormalsPipeline(DiffusionPipeline):
+ """
+ Pipeline for monocular normals estimation using the Marigold method: https://marigoldmonodepth.github.io.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ unet (`UNet2DConditionModel`):
+ Conditional U-Net to denoise the normals latent, conditioned on image latent.
+ vae (`AutoencoderKL`):
+ Variational Auto-Encoder (VAE) Model to encode and decode images and predictions to and from latent
+ representations.
+ scheduler (`DDIMScheduler` or `LCMScheduler`):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+ text_encoder (`CLIPTextModel`):
+ Text-encoder, for empty text embedding.
+ tokenizer (`CLIPTokenizer`):
+ CLIP tokenizer.
+ prediction_type (`str`, *optional*):
+ Type of predictions made by the model.
+ use_full_z_range (`bool`, *optional*):
+ Whether the normals predicted by this model utilize the full range of the Z dimension, or only its positive
+ half.
+ default_denoising_steps (`int`, *optional*):
+ The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
+ quality with the given model. This value must be set in the model config. When the pipeline is called
+ without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
+ reasonable results with various model flavors compatible with the pipeline, such as those relying on very
+ short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
+ default_processing_resolution (`int`, *optional*):
+ The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
+ the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
+ default value is used. This is required to ensure reasonable results with various model flavors trained
+ with varying optimal processing resolution values.
+ """
+
+ model_cpu_offload_seq = "text_encoder->unet->vae"
+ supported_prediction_types = ("normals",)
+
+ def __init__(
+ self,
+ unet: UNet2DConditionModel,
+ vae: AutoencoderKL,
+ scheduler: Union[DDIMScheduler, LCMScheduler],
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ prediction_type: Optional[str] = None,
+ use_full_z_range: Optional[bool] = True,
+ default_denoising_steps: Optional[int] = None,
+ default_processing_resolution: Optional[int] = None,
+ ):
+ super().__init__()
+
+ if prediction_type not in self.supported_prediction_types:
+ logger.warning(
+ f"Potentially unsupported `prediction_type='{prediction_type}'`; values supported by the pipeline: "
+ f"{self.supported_prediction_types}."
+ )
+
+ self.register_modules(
+ unet=unet,
+ vae=vae,
+ scheduler=scheduler,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ )
+ self.register_to_config(
+ use_full_z_range=use_full_z_range,
+ default_denoising_steps=default_denoising_steps,
+ default_processing_resolution=default_processing_resolution,
+ )
+
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+ self.use_full_z_range = use_full_z_range
+ self.default_denoising_steps = default_denoising_steps
+ self.default_processing_resolution = default_processing_resolution
+
+ self.empty_text_embedding = None
+
+ self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+ def check_inputs(
+ self,
+ image: PipelineImageInput,
+ num_inference_steps: int,
+ ensemble_size: int,
+ processing_resolution: int,
+ resample_method_input: str,
+ resample_method_output: str,
+ batch_size: int,
+ ensembling_kwargs: Optional[Dict[str, Any]],
+ latents: Optional[torch.Tensor],
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+ output_type: str,
+ output_uncertainty: bool,
+ ) -> int:
+ if num_inference_steps is None:
+ raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
+ if num_inference_steps < 1:
+ raise ValueError("`num_inference_steps` must be positive.")
+ if ensemble_size < 1:
+ raise ValueError("`ensemble_size` must be positive.")
+ if ensemble_size == 2:
+ logger.warning(
+ "`ensemble_size` == 2 results are similar to no ensembling (1); "
+ "consider increasing the value to at least 3."
+ )
+ if ensemble_size == 1 and output_uncertainty:
+ raise ValueError(
+ "Computing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` "
+ "greater than 1."
+ )
+ if processing_resolution is None:
+ raise ValueError(
+ "`processing_resolution` is not specified and could not be resolved from the model config."
+ )
+ if processing_resolution < 0:
+ raise ValueError(
+ "`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
+ "downsampled processing."
+ )
+ if processing_resolution % self.vae_scale_factor != 0:
+ raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
+ if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+ raise ValueError(
+ "`resample_method_input` takes string values compatible with PIL library: "
+ "nearest, nearest-exact, bilinear, bicubic, area."
+ )
+ if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+ raise ValueError(
+ "`resample_method_output` takes string values compatible with PIL library: "
+ "nearest, nearest-exact, bilinear, bicubic, area."
+ )
+ if batch_size < 1:
+ raise ValueError("`batch_size` must be positive.")
+ if output_type not in ["pt", "np"]:
+ raise ValueError("`output_type` must be one of `pt` or `np`.")
+ if latents is not None and generator is not None:
+ raise ValueError("`latents` and `generator` cannot be used together.")
+ if ensembling_kwargs is not None:
+ if not isinstance(ensembling_kwargs, dict):
+ raise ValueError("`ensembling_kwargs` must be a dictionary.")
+ if "reduction" in ensembling_kwargs and ensembling_kwargs["reduction"] not in ("closest", "mean"):
+ raise ValueError("`ensembling_kwargs['reduction']` can be either `'closest'` or `'mean'`.")
+
+ # image checks
+ num_images = 0
+ W, H = None, None
+ if not isinstance(image, list):
+ image = [image]
+ for i, img in enumerate(image):
+ if isinstance(img, np.ndarray) or torch.is_tensor(img):
+ if img.ndim not in (2, 3, 4):
+ raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
+ H_i, W_i = img.shape[-2:]
+ N_i = 1
+ if img.ndim == 4:
+ N_i = img.shape[0]
+ elif isinstance(img, Image.Image):
+ W_i, H_i = img.size
+ N_i = 1
+ else:
+ raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
+ if W is None:
+ W, H = W_i, H_i
+ elif (W, H) != (W_i, H_i):
+ raise ValueError(
+ f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
+ )
+ num_images += N_i
+
+ # latents checks
+ if latents is not None:
+ if not torch.is_tensor(latents):
+ raise ValueError("`latents` must be a torch.Tensor.")
+ if latents.dim() != 4:
+ raise ValueError(f"`latents` has unsupported dimensions or shape: {latents.shape}.")
+
+ if processing_resolution > 0:
+ max_orig = max(H, W)
+ new_H = H * processing_resolution // max_orig
+ new_W = W * processing_resolution // max_orig
+ if new_H == 0 or new_W == 0:
+ raise ValueError(f"Extreme aspect ratio of the input image: [{W} x {H}]")
+ W, H = new_W, new_H
+ w = (W + self.vae_scale_factor - 1) // self.vae_scale_factor
+ h = (H + self.vae_scale_factor - 1) // self.vae_scale_factor
+ shape_expected = (num_images * ensemble_size, self.vae.config.latent_channels, h, w)
+
+ if latents.shape != shape_expected:
+ raise ValueError(f"`latents` has unexpected shape={latents.shape} expected={shape_expected}.")
+
+ # generator checks
+ if generator is not None:
+ if isinstance(generator, list):
+ if len(generator) != num_images * ensemble_size:
+ raise ValueError(
+ "The number of generators must match the total number of ensemble members for all input images."
+ )
+ if not all(g.device.type == generator[0].device.type for g in generator):
+ raise ValueError("`generator` device placement is not consistent in the list.")
+ elif not isinstance(generator, torch.Generator):
+ raise ValueError(f"Unsupported generator type: {type(generator)}.")
+
+ return num_images
+
+ def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
+ if not hasattr(self, "_progress_bar_config"):
+ self._progress_bar_config = {}
+ elif not isinstance(self._progress_bar_config, dict):
+ raise ValueError(
+ f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
+ )
+
+ progress_bar_config = dict(**self._progress_bar_config)
+ progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
+ progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
+ if iterable is not None:
+ return tqdm(iterable, **progress_bar_config)
+ elif total is not None:
+ return tqdm(total=total, **progress_bar_config)
+ else:
+ raise ValueError("Either `total` or `iterable` has to be defined.")
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ image: PipelineImageInput,
+ num_inference_steps: Optional[int] = None,
+ ensemble_size: int = 1,
+ processing_resolution: Optional[int] = None,
+ match_input_resolution: bool = True,
+ resample_method_input: str = "bilinear",
+ resample_method_output: str = "bilinear",
+ batch_size: int = 1,
+ ensembling_kwargs: Optional[Dict[str, Any]] = None,
+ latents: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ output_type: str = "np",
+ output_uncertainty: bool = False,
+ output_latent: bool = False,
+ return_dict: bool = True,
+ ):
+ """
+ Function invoked when calling the pipeline.
+
+ Args:
+ image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
+ `List[torch.Tensor]`): An input image or images used as an input for the normals estimation task. For
+ arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is possible
+ by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
+ three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
+ same width and height.
+ num_inference_steps (`int`, *optional*, defaults to `None`):
+ Number of denoising diffusion steps during inference. The default value `None` results in automatic
+ selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
+ for Marigold-LCM models.
+ ensemble_size (`int`, defaults to `1`):
+ Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
+ faster inference.
+ processing_resolution (`int`, *optional*, defaults to `None`):
+ Effective processing resolution. When set to `0`, matches the larger input image dimension. This
+ produces crisper predictions, but may also lead to the overall loss of global context. The default
+ value `None` resolves to the optimal value from the model config.
+ match_input_resolution (`bool`, *optional*, defaults to `True`):
+ When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
+ side of the output will equal `processing_resolution`.
+ resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
+ Resampling method used to resize input images to `processing_resolution`. The accepted values are:
+ `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+ resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
+ Resampling method used to resize output predictions to match the input resolution. The accepted values
+ are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+ batch_size (`int`, *optional*, defaults to `1`):
+ Batch size; only matters when setting `ensemble_size` or passing a tensor of images.
+ ensembling_kwargs (`dict`, *optional*, defaults to `None`):
+ Extra dictionary with arguments for precise ensembling control. The following options are available:
+ - reduction (`str`, *optional*, defaults to `"closest"`): Defines the ensembling function applied in
+ every pixel location, can be either `"closest"` or `"mean"`.
+ latents (`torch.Tensor`, or `List[torch.Tensor]`, *optional*, defaults to `None`):
+ Latent noise tensors to replace the random initialization. These can be taken from the previous
+ function call's output.
+ generator (`torch.Generator`, or `List[torch.Generator]`, *optional*, defaults to `None`):
+ Random number generator object to ensure reproducibility.
+ output_type (`str`, *optional*, defaults to `"np"`):
+ Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
+ values are: `"np"` (numpy array) or `"pt"` (torch tensor).
+ output_uncertainty (`bool`, *optional*, defaults to `False`):
+ When enabled, the output's `uncertainty` field contains the predictive uncertainty map, provided that
+ the `ensemble_size` argument is set to a value greater than 1.
+ output_latent (`bool`, *optional*, defaults to `False`):
+ When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
+ within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
+ `latents` argument.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.marigold.MarigoldNormalsOutput`] instead of a plain tuple.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.marigold.MarigoldNormalsOutput`] or `tuple`:
+ If `return_dict` is `True`, [`~pipelines.marigold.MarigoldNormalsOutput`] is returned, otherwise a
+ `tuple` is returned where the first element is the prediction, the second element is the uncertainty
+ (or `None`), and the third is the latent (or `None`).
+ """
+
+ # 0. Resolving variables.
+ device = self._execution_device
+ dtype = self.dtype
+
+ # Model-specific optimal default values leading to fast and reasonable results.
+ if num_inference_steps is None:
+ num_inference_steps = self.default_denoising_steps
+ if processing_resolution is None:
+ processing_resolution = self.default_processing_resolution
+
+ # 1. Check inputs.
+ num_images = self.check_inputs(
+ image,
+ num_inference_steps,
+ ensemble_size,
+ processing_resolution,
+ resample_method_input,
+ resample_method_output,
+ batch_size,
+ ensembling_kwargs,
+ latents,
+ generator,
+ output_type,
+ output_uncertainty,
+ )
+
+ # 2. Prepare empty text conditioning.
+ # Model invocation: self.tokenizer, self.text_encoder.
+ if self.empty_text_embedding is None:
+ prompt = ""
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="do_not_pad",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids.to(device)
+ self.empty_text_embedding = self.text_encoder(text_input_ids)[0] # [1,2,1024]
+
+ # 3. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
+ # optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
+ # `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
+ # divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
+ # of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
+ # operation and leads to the most reasonable results. Using the native image resolution or any other processing
+ # resolution can lead to loss of either fine details or global context in the output predictions.
+ image, padding, original_resolution = self.image_processor.preprocess(
+ image, processing_resolution, resample_method_input, device, dtype
+ ) # [N,3,PPH,PPW]
+
+ # 4. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
+ # ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
+ # The latents of each such prediction across all input images and all ensemble members are represented in the
+ # `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
+ # into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
+ # reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
+ # code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
+ # as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
+ # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
+ # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
+ # Model invocation: self.vae.encoder.
+ image_latent, pred_latent = self.prepare_latents(
+ image, latents, generator, ensemble_size, batch_size
+ ) # [N*E,4,h,w], [N*E,4,h,w]
+
+ del image
+
+ batch_empty_text_embedding = self.empty_text_embedding.to(device=device, dtype=dtype).repeat(
+ batch_size, 1, 1
+ ) # [B,2,1024]
+
+ # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
+ # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
+ # outputs noise for the predicted modality's latent space. The number of denoising diffusion steps is defined by
+ # `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
+ # model.
+ # Model invocation: self.unet.
+ pred_latents = []
+
+ for i in self.progress_bar(
+ range(0, num_images * ensemble_size, batch_size), leave=True, desc="Marigold predictions..."
+ ):
+ batch_image_latent = image_latent[i : i + batch_size] # [B,4,h,w]
+ batch_pred_latent = pred_latent[i : i + batch_size] # [B,4,h,w]
+ effective_batch_size = batch_image_latent.shape[0]
+ text = batch_empty_text_embedding[:effective_batch_size] # [B,2,1024]
+
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ for t in self.progress_bar(self.scheduler.timesteps, leave=False, desc="Diffusion steps..."):
+ batch_latent = torch.cat([batch_image_latent, batch_pred_latent], dim=1) # [B,8,h,w]
+ noise = self.unet(batch_latent, t, encoder_hidden_states=text, return_dict=False)[0] # [B,4,h,w]
+ batch_pred_latent = self.scheduler.step(
+ noise, t, batch_pred_latent, generator=generator
+ ).prev_sample # [B,4,h,w]
+
+ pred_latents.append(batch_pred_latent)
+
+ pred_latent = torch.cat(pred_latents, dim=0) # [N*E,4,h,w]
+
+ del (
+ pred_latents,
+ image_latent,
+ batch_empty_text_embedding,
+ batch_image_latent,
+ batch_pred_latent,
+ text,
+ batch_latent,
+ noise,
+ )
+
+ # 6. Decode predictions from latent into pixel space. The resulting `N * E` predictions have shape `(PPH, PPW)`,
+ # which requires slight postprocessing. Decoding into pixel space happens in batches of size `batch_size`.
+ # Model invocation: self.vae.decoder.
+ prediction = torch.cat(
+ [
+ self.decode_prediction(pred_latent[i : i + batch_size])
+ for i in range(0, pred_latent.shape[0], batch_size)
+ ],
+ dim=0,
+ ) # [N*E,3,PPH,PPW]
+
+ if not output_latent:
+ pred_latent = None
+
+ # 7. Remove padding. The output shape is (PH, PW).
+ prediction = self.image_processor.unpad_image(prediction, padding) # [N*E,3,PH,PW]
+
+ # 8. Ensemble and compute uncertainty (when `output_uncertainty` is set). This code treats each of the `N`
+ # groups of `E` ensemble predictions independently. For each group it computes an ensembled prediction of shape
+ # `(PH, PW)` and an optional uncertainty map of the same dimensions. After computing this pair of outputs for
+ # each group independently, it stacks them respectively into batches of `N` almost final predictions and
+ # uncertainty maps.
+ uncertainty = None
+ if ensemble_size > 1:
+ prediction = prediction.reshape(num_images, ensemble_size, *prediction.shape[1:]) # [N,E,3,PH,PW]
+ prediction = [
+ self.ensemble_normals(prediction[i], output_uncertainty, **(ensembling_kwargs or {}))
+ for i in range(num_images)
+ ] # [ [[1,3,PH,PW], [1,1,PH,PW]], ... ]
+ prediction, uncertainty = zip(*prediction) # [[1,3,PH,PW], ... ], [[1,1,PH,PW], ... ]
+ prediction = torch.cat(prediction, dim=0) # [N,3,PH,PW]
+ if output_uncertainty:
+ uncertainty = torch.cat(uncertainty, dim=0) # [N,1,PH,PW]
+ else:
+ uncertainty = None
+
+ # 9. If `match_input_resolution` is set, the output prediction and the uncertainty are upsampled to match the
+ # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
+ # After upsampling to the native resolution, the normal maps are renormalized to unit length to reduce these artifacts.
+ # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
+ # setting the `resample_method_output` parameter (e.g., to `"nearest"`).
+ if match_input_resolution:
+ prediction = self.image_processor.resize_antialias(
+ prediction, original_resolution, resample_method_output, is_aa=False
+ ) # [N,3,H,W]
+ prediction = self.normalize_normals(prediction) # [N,3,H,W]
+ if uncertainty is not None and output_uncertainty:
+ uncertainty = self.image_processor.resize_antialias(
+ uncertainty, original_resolution, resample_method_output, is_aa=False
+ ) # [N,1,H,W]
+
+ # 10. Prepare the final outputs.
+ if output_type == "np":
+ prediction = self.image_processor.pt_to_numpy(prediction) # [N,H,W,3]
+ if uncertainty is not None and output_uncertainty:
+ uncertainty = self.image_processor.pt_to_numpy(uncertainty) # [N,H,W,1]
+
+ # 11. Offload all models
+ self.maybe_free_model_hooks()
+
+ if not return_dict:
+ return (prediction, uncertainty, pred_latent)
+
+ return MarigoldNormalsOutput(
+ prediction=prediction,
+ uncertainty=uncertainty,
+ latent=pred_latent,
+ )
+
+ # Copied from diffusers.pipelines.marigold.pipeline_marigold_depth.MarigoldDepthPipeline.prepare_latents
+ def prepare_latents(
+ self,
+ image: torch.Tensor,
+ latents: Optional[torch.Tensor],
+ generator: Optional[torch.Generator],
+ ensemble_size: int,
+ batch_size: int,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ def retrieve_latents(encoder_output):
+ if hasattr(encoder_output, "latent_dist"):
+ return encoder_output.latent_dist.mode()
+ elif hasattr(encoder_output, "latents"):
+ return encoder_output.latents
+ else:
+ raise AttributeError("Could not access latents of provided encoder_output")
+
+ image_latent = torch.cat(
+ [
+ retrieve_latents(self.vae.encode(image[i : i + batch_size]))
+ for i in range(0, image.shape[0], batch_size)
+ ],
+ dim=0,
+ ) # [N,4,h,w]
+ image_latent = image_latent * self.vae.config.scaling_factor
+ image_latent = image_latent.repeat_interleave(ensemble_size, dim=0) # [N*E,4,h,w]
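+ # `repeat_interleave` keeps the E ensemble copies of each image adjacent: batch index i
+ # corresponds to input image i // E and ensemble member i % E.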
+
+ pred_latent = latents
+ if pred_latent is None:
+ pred_latent = randn_tensor(
+ image_latent.shape,
+ generator=generator,
+ device=image_latent.device,
+ dtype=image_latent.dtype,
+ ) # [N*E,4,h,w]
+
+ return image_latent, pred_latent
+
+ def decode_prediction(self, pred_latent: torch.Tensor) -> torch.Tensor:
+ if pred_latent.dim() != 4 or pred_latent.shape[1] != self.vae.config.latent_channels:
+ raise ValueError(
+ f"Expecting 4D tensor of shape [B,{self.vae.config.latent_channels},H,W]; got {pred_latent.shape}."
+ )
+
+ prediction = self.vae.decode(pred_latent / self.vae.config.scaling_factor, return_dict=False)[0] # [B,3,H,W]
+
+ prediction = torch.clip(prediction, -1.0, 1.0)
+
+ if not self.use_full_z_range:
+ prediction[:, 2, :, :] *= 0.5
+ prediction[:, 2, :, :] += 0.5
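+ # Models trained with only the positive Z half decode Z into [-1, 1]; the remap above
+ # folds it back into [0, 1] before renormalization to unit length.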
+
+ prediction = self.normalize_normals(prediction) # [B,3,H,W]
+
+ return prediction # [B,3,H,W]
+
+ @staticmethod
+ def normalize_normals(normals: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+ if normals.dim() != 4 or normals.shape[1] != 3:
+ raise ValueError(f"Expecting 4D tensor of shape [B,3,H,W]; got {normals.shape}.")
+
+ norm = torch.norm(normals, dim=1, keepdim=True)
+ normals /= norm.clamp(min=eps)
+
+ return normals
+
+ @staticmethod
+ def ensemble_normals(
+ normals: torch.Tensor, output_uncertainty: bool, reduction: str = "closest"
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ """
+ Ensembles the normals maps represented by the `normals` tensor with expected shape `(B, 3, H, W)`, where B is
+ the number of ensemble members for a given prediction of size `(H x W)`.
+
+ Args:
+ normals (`torch.Tensor`):
+ Input ensemble normals maps.
+ output_uncertainty (`bool`, *optional*, defaults to `False`):
+ Whether to output uncertainty map.
+ reduction (`str`, *optional*, defaults to `"closest"`):
+ Reduction method used to ensemble aligned predictions. The accepted values are: `"closest"` and
+ `"mean"`.
+
+ Returns:
+ A tensor of aligned and ensembled normals maps with shape `(1, 3, H, W)` and optionally a tensor of
+ uncertainties of shape `(1, 1, H, W)`.
+ """
+ if normals.dim() != 4 or normals.shape[1] != 3:
+ raise ValueError(f"Expecting 4D tensor of shape [B,3,H,W]; got {normals.shape}.")
+ if reduction not in ("closest", "mean"):
+ raise ValueError(f"Unrecognized reduction method: {reduction}.")
+
+ mean_normals = normals.mean(dim=0, keepdim=True) # [1,3,H,W]
+ mean_normals = MarigoldNormalsPipeline.normalize_normals(mean_normals) # [1,3,H,W]
+
+ sim_cos = (mean_normals * normals).sum(dim=1, keepdim=True) # [E,1,H,W]
+ sim_cos = sim_cos.clamp(-1, 1) # required to avoid NaN in uncertainty with fp16
+
+ uncertainty = None
+ if output_uncertainty:
+ uncertainty = sim_cos.arccos() # [E,1,H,W]
+ uncertainty = uncertainty.mean(dim=0, keepdim=True) / np.pi # [1,1,H,W]
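+ # Normalizing the mean angular deviation by pi maps the uncertainty into [0, 1].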
+
+ if reduction == "mean":
+ return mean_normals, uncertainty # [1,3,H,W], [1,1,H,W]
+
+ closest_indices = sim_cos.argmax(dim=0, keepdim=True) # [1,1,H,W]
+ closest_indices = closest_indices.repeat(1, 3, 1, 1) # [1,3,H,W]
+ closest_normals = torch.gather(normals, 0, closest_indices) # [1,3,H,W]
+
+ return closest_normals, uncertainty # [1,3,H,W], [1,1,H,W]
diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
index 6d3f5c1e274d..355d22350287 100644
--- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
+++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
@@ -394,7 +394,7 @@ def encode_prompt(
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
- uncond_tokens = [negative_prompt] * batch_size
+ uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
index 1db7e5d9ab8a..2d0efce5ef74 100644
--- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
+++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
@@ -320,7 +320,7 @@ def encode_prompt(
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
- uncond_tokens = [negative_prompt] * batch_size
+ uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index 04f91d758b94..7ab0a94e5677 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -68,6 +68,7 @@
is_k_diffusion_available,
is_k_diffusion_version,
is_librosa_available,
+ is_matplotlib_available,
is_note_seq_available,
is_notebook,
is_onnx_available,
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 0583cf839ff7..df436bc46c06 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -692,6 +692,36 @@ def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
+class MarigoldDepthPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
+class MarigoldNormalsPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
class MusicLDMPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
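
The two new dummy classes follow the existing pattern in this file: the pipeline names stay importable without `torch`/`transformers`, and any attempt to use them raises a helpful backend error. A simplified, self-contained illustration of the idea (this is not the actual `DummyObject`/`requires_backends` implementation):

class _DummyMeta(type):
    # Block instantiation of placeholder classes with an informative message.
    def __call__(cls, *args, **kwargs):
        raise ImportError(f"{cls.__name__} requires the backends {cls._backends}.")

class PlaceholderPipeline(metaclass=_DummyMeta):
    _backends = ["torch", "transformers"]

try:
    PlaceholderPipeline()
except ImportError as err:
    print(err)  # PlaceholderPipeline requires the backends ['torch', 'transformers'].
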
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
index b8ce2d7c0466..6f70f5888910 100644
--- a/src/diffusers/utils/import_utils.py
+++ b/src/diffusers/utils/import_utils.py
@@ -295,6 +295,13 @@
except importlib_metadata.PackageNotFoundError:
_torchvision_available = False
+_matplotlib_available = importlib.util.find_spec("matplotlib") is not None
+try:
+ _matplotlib_version = importlib_metadata.version("matplotlib")
+ logger.debug(f"Successfully imported matplotlib version {_matplotlib_version}")
+except importlib_metadata.PackageNotFoundError:
+ _matplotlib_available = False
+
_timm_available = importlib.util.find_spec("timm") is not None
if _timm_available:
try:
@@ -425,6 +432,10 @@ def is_torchvision_available():
return _torchvision_available
+def is_matplotlib_available():
+ return _matplotlib_available
+
+
def is_safetensors_available():
return _safetensors_available
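
The matplotlib probe mirrors the surrounding availability checks: a cheap `find_spec` presence test plus a metadata lookup for the installed version. A standalone sketch of the same pattern using the stdlib `importlib.metadata` (the library itself goes through its `importlib_metadata` shim):

import importlib.util
from importlib import metadata

def is_package_available(name: str) -> bool:
    # Available only if the module can be found and its distribution metadata resolves.
    if importlib.util.find_spec(name) is None:
        return False
    try:
        metadata.version(name)
        return True
    except metadata.PackageNotFoundError:
        return False

print(is_package_available("matplotlib"))
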
diff --git a/src/diffusers/utils/logging.py b/src/diffusers/utils/logging.py
index 2e80d30f1311..6f93450c410c 100644
--- a/src/diffusers/utils/logging.py
+++ b/src/diffusers/utils/logging.py
@@ -82,7 +82,9 @@ def _configure_library_root_logger() -> None:
# This library has already configured the library root logger.
return
_default_handler = logging.StreamHandler() # Set sys.stderr as stream.
- _default_handler.flush = sys.stderr.flush
+
+ if sys.stderr: # sys.stderr can be None, e.g. when running under pythonw on Windows
+ _default_handler.flush = sys.stderr.flush
# Apply our default configuration to the library root logger.
library_root_logger = _get_library_root_logger()
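
Context for the logging change: `sys.stderr` can be `None` under GUI-only interpreters such as `pythonw` on Windows, so binding `flush` unconditionally fails at import time. A minimal standalone sketch of the guarded handler setup (not the library's logger module):

import logging
import sys

handler = logging.StreamHandler()      # defaults to sys.stderr
if sys.stderr is not None:             # pythonw on Windows can leave sys.stderr as None
    handler.flush = sys.stderr.flush   # keep the eager-flush behavior when a stream exists

logger = logging.getLogger("example")
logger.addHandler(handler)
logger.warning("handler configured")
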
diff --git a/src/diffusers/utils/state_dict_utils.py b/src/diffusers/utils/state_dict_utils.py
index dc303a35a8e3..62b114ba67e3 100644
--- a/src/diffusers/utils/state_dict_utils.py
+++ b/src/diffusers/utils/state_dict_utils.py
@@ -62,6 +62,8 @@ class StateDictType(enum.Enum):
".out_proj.lora_linear_layer.down": ".out_proj.lora_A",
".lora_linear_layer.up": ".lora_B",
".lora_linear_layer.down": ".lora_A",
+ "text_projection.lora.down.weight": "text_projection.lora_A.weight",
+ "text_projection.lora.up.weight": "text_projection.lora_B.weight",
}
DIFFUSERS_OLD_TO_PEFT = {
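
The two added entries extend the diffusers-to-PEFT key map to cover the `text_projection` LoRA weights. Applying such a map is a substring rewrite over state-dict keys; a hedged sketch with an illustrative helper and toy values:

from typing import Dict

DIFFUSERS_TO_PEFT_SUBSET = {
    "text_projection.lora.down.weight": "text_projection.lora_A.weight",
    "text_projection.lora.up.weight": "text_projection.lora_B.weight",
}

def remap_keys(state_dict: Dict[str, object], mapping: Dict[str, str]) -> Dict[str, object]:
    # Rewrite each key using the first matching pattern; unmatched keys pass through.
    remapped = {}
    for key, value in state_dict.items():
        for old, new in mapping.items():
            if old in key:
                key = key.replace(old, new)
                break
        remapped[key] = value
    return remapped

sd = {"text_encoder.text_projection.lora.down.weight": 0}
print(remap_keys(sd, DIFFUSERS_TO_PEFT_SUBSET))
# {'text_encoder.text_projection.lora_A.weight': 0}
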
diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py
index 33aa6a10377b..ad33df964d5d 100644
--- a/tests/models/unets/test_models_unet_2d_condition.py
+++ b/tests/models/unets/test_models_unet_2d_condition.py
@@ -146,42 +146,58 @@ def create_ip_adapter_plus_state_dict(model):
)
ip_image_projection_state_dict = OrderedDict()
for k, v in image_projection.state_dict().items():
if "2.to" in k:
k = k.replace("2.to", "0.to")
- elif "3.0.weight" in k:
- k = k.replace("3.0.weight", "1.0.weight")
- elif "3.0.bias" in k:
- k = k.replace("3.0.bias", "1.0.bias")
- elif "3.0.weight" in k:
- k = k.replace("3.0.weight", "1.0.weight")
- elif "3.1.net.0.proj.weight" in k:
- k = k.replace("3.1.net.0.proj.weight", "1.1.weight")
- elif "3.net.2.weight" in k:
- k = k.replace("3.net.2.weight", "1.3.weight")
- elif "layers.0.0" in k:
- k = k.replace("layers.0.0", "layers.0.0.norm1")
- elif "layers.0.1" in k:
- k = k.replace("layers.0.1", "layers.0.0.norm2")
- elif "layers.1.0" in k:
- k = k.replace("layers.1.0", "layers.1.0.norm1")
- elif "layers.1.1" in k:
- k = k.replace("layers.1.1", "layers.1.0.norm2")
- elif "layers.2.0" in k:
- k = k.replace("layers.2.0", "layers.2.0.norm1")
- elif "layers.2.1" in k:
- k = k.replace("layers.2.1", "layers.2.0.norm2")
-
- if "norm_cross" in k:
- ip_image_projection_state_dict[k.replace("norm_cross", "norm1")] = v
- elif "layer_norm" in k:
- ip_image_projection_state_dict[k.replace("layer_norm", "norm2")] = v
- elif "to_k" in k:
+ elif "layers.0.ln0" in k:
+ k = k.replace("layers.0.ln0", "layers.0.0.norm1")
+ elif "layers.0.ln1" in k:
+ k = k.replace("layers.0.ln1", "layers.0.0.norm2")
+ elif "layers.1.ln0" in k:
+ k = k.replace("layers.1.ln0", "layers.1.0.norm1")
+ elif "layers.1.ln1" in k:
+ k = k.replace("layers.1.ln1", "layers.1.0.norm2")
+ elif "layers.2.ln0" in k:
+ k = k.replace("layers.2.ln0", "layers.2.0.norm1")
+ elif "layers.2.ln1" in k:
+ k = k.replace("layers.2.ln1", "layers.2.0.norm2")
+ elif "layers.3.ln0" in k:
+ k = k.replace("layers.3.ln0", "layers.3.0.norm1")
+ elif "layers.3.ln1" in k:
+ k = k.replace("layers.3.ln1", "layers.3.0.norm2")
+ elif "to_q" in k:
+ parts = k.split(".")
+ parts[2] = "attn"
+ k = ".".join(parts)
+ elif "to_out.0" in k:
+ parts = k.split(".")
+ parts[2] = "attn"
+ k = ".".join(parts)
+ k = k.replace("to_out.0", "to_out")
+ else:
+ k = k.replace("0.ff.0", "0.1.0")
+ k = k.replace("0.ff.1.net.0.proj", "0.1.1")
+ k = k.replace("0.ff.1.net.2", "0.1.3")
+
+ k = k.replace("1.ff.0", "1.1.0")
+ k = k.replace("1.ff.1.net.0.proj", "1.1.1")
+ k = k.replace("1.ff.1.net.2", "1.1.3")
+
+ k = k.replace("2.ff.0", "2.1.0")
+ k = k.replace("2.ff.1.net.0.proj", "2.1.1")
+ k = k.replace("2.ff.1.net.2", "2.1.3")
+
+ k = k.replace("3.ff.0", "3.1.0")
+ k = k.replace("3.ff.1.net.0.proj", "3.1.1")
+ k = k.replace("3.ff.1.net.2", "3.1.3")
+
+ # if "norm_cross" in k:
+ # ip_image_projection_state_dict[k.replace("norm_cross", "norm1")] = v
+ # elif "layer_norm" in k:
+ # ip_image_projection_state_dict[k.replace("layer_norm", "norm2")] = v
+ if "to_k" in k:
+ parts = k.split(".")
+ parts[2] = "attn"
+ k = ".".join(parts)
ip_image_projection_state_dict[k.replace("to_k", "to_kv")] = torch.cat([v, v], dim=0)
elif "to_v" in k:
continue
- elif "to_out.0" in k:
- ip_image_projection_state_dict[k.replace("to_out.0", "to_out")] = v
else:
ip_image_projection_state_dict[k] = v
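
A recurring idiom in the rewritten test helper above is renaming a fixed, position-based segment of a dotted parameter name (e.g. slotting `attn` into index 2). A compact standalone illustration:

def rename_segment(key: str, index: int, new_name: str) -> str:
    # Replace one dot-separated segment of a parameter name by position.
    parts = key.split(".")
    parts[index] = new_name
    return ".".join(parts)

print(rename_segment("layers.0.attention.to_q.weight", 2, "attn"))
# layers.0.attn.to_q.weight
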
diff --git a/tests/pipelines/marigold/__init__.py b/tests/pipelines/marigold/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/marigold/test_marigold_depth.py b/tests/pipelines/marigold/test_marigold_depth.py
new file mode 100644
index 000000000000..24d1981b8fb2
--- /dev/null
+++ b/tests/pipelines/marigold/test_marigold_depth.py
@@ -0,0 +1,459 @@
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# More information and citation instructions are available on the
+# Marigold project website: https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+ AutoencoderKL,
+ AutoencoderTiny,
+ LCMScheduler,
+ MarigoldDepthPipeline,
+ UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+ enable_full_determinism,
+ floats_tensor,
+ load_image,
+ require_torch_gpu,
+ slow,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class MarigoldDepthPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+ pipeline_class = MarigoldDepthPipeline
+ params = frozenset(["image"])
+ batch_params = frozenset(["image"])
+ image_params = frozenset(["image"])
+ image_latents_params = frozenset(["latents"])
+ callback_cfg_params = frozenset([])
+ test_xformers_attention = False
+ required_optional_params = frozenset(
+ [
+ "num_inference_steps",
+ "generator",
+ "output_type",
+ ]
+ )
+
+ def get_dummy_components(self, time_cond_proj_dim=None):
+ torch.manual_seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ time_cond_proj_dim=time_cond_proj_dim,
+ sample_size=32,
+ in_channels=8,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+ cross_attention_dim=32,
+ )
+ scheduler = LCMScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ prediction_type="v_prediction",
+ set_alpha_to_one=False,
+ steps_offset=1,
+ beta_schedule="scaled_linear",
+ clip_sample=False,
+ thresholding=False,
+ )
+ torch.manual_seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ )
+ torch.manual_seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+ components = {
+ "unet": unet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "text_encoder": text_encoder,
+ "tokenizer": tokenizer,
+ "prediction_type": "depth",
+ "scale_invariant": True,
+ "shift_invariant": True,
+ }
+ return components
+
+ def get_dummy_tiny_autoencoder(self):
+ return AutoencoderTiny(in_channels=3, out_channels=3, latent_channels=4)
+
+ def get_dummy_inputs(self, device, seed=0):
+ image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+ image = image / 2 + 0.5
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+ inputs = {
+ "image": image,
+ "num_inference_steps": 1,
+ "processing_resolution": 0,
+ "generator": generator,
+ "output_type": "np",
+ }
+ return inputs
+
+ def _test_marigold_depth(
+ self,
+ generator_seed: int = 0,
+ expected_slice: np.ndarray = None,
+ atol: float = 1e-4,
+ **pipe_kwargs,
+ ):
+ device = "cpu"
+ components = self.get_dummy_components()
+
+ pipe = self.pipeline_class(**components)
+ pipe.to(device)
+ pipe.set_progress_bar_config(disable=None)
+
+ pipe_inputs = self.get_dummy_inputs(device, seed=generator_seed)
+ pipe_inputs.update(**pipe_kwargs)
+
+ prediction = pipe(**pipe_inputs).prediction
+
+ prediction_slice = prediction[0, -3:, -3:, -1].flatten()
+
+ if pipe_inputs.get("match_input_resolution", True):
+ self.assertEqual(prediction.shape, (1, 32, 32, 1), "Unexpected output resolution")
+ else:
+ self.assertTrue(prediction.shape[0] == 1 and prediction.shape[3] == 1, "Unexpected output dimensions")
+ self.assertEqual(
+ max(prediction.shape[1:3]),
+ pipe_inputs.get("processing_resolution", 768),
+ "Unexpected output resolution",
+ )
+
+ self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
+
+ def test_marigold_depth_dummy_defaults(self):
+ self._test_marigold_depth(
+ expected_slice=np.array([0.4529, 0.5184, 0.4985, 0.4355, 0.4273, 0.4153, 0.5229, 0.4818, 0.4627]),
+ )
+
+ def test_marigold_depth_dummy_G0_S1_P32_E1_B1_M1(self):
+ self._test_marigold_depth(
+ generator_seed=0,
+ expected_slice=np.array([0.4529, 0.5184, 0.4985, 0.4355, 0.4273, 0.4153, 0.5229, 0.4818, 0.4627]),
+ num_inference_steps=1,
+ processing_resolution=32,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M1(self):
+ self._test_marigold_depth(
+ generator_seed=0,
+ expected_slice=np.array([0.4511, 0.4531, 0.4542, 0.5024, 0.4987, 0.4969, 0.5281, 0.5215, 0.5182]),
+ num_inference_steps=1,
+ processing_resolution=16,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_dummy_G2024_S1_P32_E1_B1_M1(self):
+ self._test_marigold_depth(
+ generator_seed=2024,
+ expected_slice=np.array([0.4671, 0.4739, 0.5130, 0.4308, 0.4411, 0.4720, 0.5064, 0.4796, 0.4795]),
+ num_inference_steps=1,
+ processing_resolution=32,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_dummy_G0_S2_P32_E1_B1_M1(self):
+ self._test_marigold_depth(
+ generator_seed=0,
+ expected_slice=np.array([0.4165, 0.4485, 0.4647, 0.4003, 0.4577, 0.5074, 0.5106, 0.5077, 0.5042]),
+ num_inference_steps=2,
+ processing_resolution=32,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_dummy_G0_S1_P64_E1_B1_M1(self):
+ self._test_marigold_depth(
+ generator_seed=0,
+ expected_slice=np.array([0.4817, 0.5425, 0.5146, 0.5367, 0.5034, 0.4743, 0.4395, 0.4734, 0.4399]),
+ num_inference_steps=1,
+ processing_resolution=64,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_dummy_G0_S1_P32_E3_B1_M1(self):
+ self._test_marigold_depth(
+ generator_seed=0,
+ expected_slice=np.array([0.3260, 0.3591, 0.2837, 0.2971, 0.2750, 0.2426, 0.4200, 0.3588, 0.3254]),
+ num_inference_steps=1,
+ processing_resolution=32,
+ ensemble_size=3,
+ ensembling_kwargs={"reduction": "mean"},
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_dummy_G0_S1_P32_E4_B2_M1(self):
+ self._test_marigold_depth(
+ generator_seed=0,
+ expected_slice=np.array([0.3180, 0.4194, 0.3013, 0.2902, 0.3245, 0.2897, 0.4718, 0.4174, 0.3705]),
+ num_inference_steps=1,
+ processing_resolution=32,
+ ensemble_size=4,
+ ensembling_kwargs={"reduction": "mean"},
+ batch_size=2,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M0(self):
+ self._test_marigold_depth(
+ generator_seed=0,
+ expected_slice=np.array([0.5515, 0.4588, 0.4197, 0.4741, 0.4229, 0.4328, 0.5333, 0.5314, 0.5182]),
+ num_inference_steps=1,
+ processing_resolution=16,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=False,
+ )
+
+ def test_marigold_depth_dummy_no_num_inference_steps(self):
+ with self.assertRaises(ValueError) as e:
+ self._test_marigold_depth(
+ num_inference_steps=None,
+ expected_slice=np.array([0.0]),
+ )
+ self.assertIn("num_inference_steps", str(e))
+
+ def test_marigold_depth_dummy_no_processing_resolution(self):
+ with self.assertRaises(ValueError) as e:
+ self._test_marigold_depth(
+ processing_resolution=None,
+ expected_slice=np.array([0.0]),
+ )
+ self.assertIn("processing_resolution", str(e))
+
+
+@slow
+@require_torch_gpu
+class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ def tearDown(self):
+ super().tearDown()
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ def _test_marigold_depth(
+ self,
+ is_fp16: bool = True,
+ device: str = "cuda",
+ generator_seed: int = 0,
+ expected_slice: np.ndarray = None,
+ model_id: str = "prs-eth/marigold-lcm-v1-0",
+ image_url: str = "https://marigoldmonodepth.github.io/images/einstein.jpg",
+ atol: float = 1e-4,
+ **pipe_kwargs,
+ ):
+ from_pretrained_kwargs = {}
+ if is_fp16:
+ from_pretrained_kwargs["variant"] = "fp16"
+ from_pretrained_kwargs["torch_dtype"] = torch.float16
+
+ pipe = MarigoldDepthPipeline.from_pretrained(model_id, **from_pretrained_kwargs)
+ if device == "cuda":
+ pipe.enable_model_cpu_offload()
+ pipe.set_progress_bar_config(disable=None)
+
+ generator = torch.Generator(device=device).manual_seed(generator_seed)
+
+ image = load_image(image_url)
+ width, height = image.size
+
+ prediction = pipe(image, generator=generator, **pipe_kwargs).prediction
+
+ prediction_slice = prediction[0, -3:, -3:, -1].flatten()
+
+ if pipe_kwargs.get("match_input_resolution", True):
+ self.assertEqual(prediction.shape, (1, height, width, 1), "Unexpected output resolution")
+ else:
+ self.assertTrue(prediction.shape[0] == 1 and prediction.shape[3] == 1, "Unexpected output dimensions")
+ self.assertEqual(
+ max(prediction.shape[1:3]),
+ pipe_kwargs.get("processing_resolution", 768),
+ "Unexpected output resolution",
+ )
+
+ self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
+
+ def test_marigold_depth_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
+ self._test_marigold_depth(
+ is_fp16=False,
+ device="cpu",
+ generator_seed=0,
+ expected_slice=np.array([0.4323, 0.4323, 0.4323, 0.4323, 0.4323, 0.4323, 0.4323, 0.4323, 0.4323]),
+ num_inference_steps=1,
+ processing_resolution=32,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
+ self._test_marigold_depth(
+ is_fp16=False,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.1244, 0.1265, 0.1292, 0.1240, 0.1252, 0.1266, 0.1246, 0.1226, 0.1180]),
+ num_inference_steps=1,
+ processing_resolution=768,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
+ self._test_marigold_depth(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.1241, 0.1262, 0.1290, 0.1238, 0.1250, 0.1265, 0.1244, 0.1225, 0.1179]),
+ num_inference_steps=1,
+ processing_resolution=768,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
+ self._test_marigold_depth(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=2024,
+ expected_slice=np.array([0.1710, 0.1725, 0.1738, 0.1700, 0.1700, 0.1696, 0.1698, 0.1663, 0.1592]),
+ num_inference_steps=1,
+ processing_resolution=768,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
+ self._test_marigold_depth(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]),
+ num_inference_steps=2,
+ processing_resolution=768,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
+ self._test_marigold_depth(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.2683, 0.2693, 0.2698, 0.2666, 0.2632, 0.2615, 0.2656, 0.2603, 0.2573]),
+ num_inference_steps=1,
+ processing_resolution=512,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
+ self._test_marigold_depth(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.1200, 0.1215, 0.1237, 0.1193, 0.1197, 0.1202, 0.1196, 0.1166, 0.1109]),
+ num_inference_steps=1,
+ processing_resolution=768,
+ ensemble_size=3,
+ ensembling_kwargs={"reduction": "mean"},
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
+ self._test_marigold_depth(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.1121, 0.1135, 0.1155, 0.1111, 0.1115, 0.1118, 0.1111, 0.1079, 0.1019]),
+ num_inference_steps=1,
+ processing_resolution=768,
+ ensemble_size=4,
+ ensembling_kwargs={"reduction": "mean"},
+ batch_size=2,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
+ self._test_marigold_depth(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.2671, 0.2690, 0.2720, 0.2659, 0.2676, 0.2739, 0.2664, 0.2686, 0.2573]),
+ num_inference_steps=1,
+ processing_resolution=512,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=False,
+ )
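
For orientation, the integration tests above collapse into a minimal usage sketch of MarigoldDepthPipeline; the checkpoint, image URL, and argument values are the ones exercised by the tests and are illustrative rather than recommendations:

import torch
from diffusers import MarigoldDepthPipeline
from diffusers.utils import load_image

pipe = MarigoldDepthPipeline.from_pretrained(
    "prs-eth/marigold-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

image = load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
generator = torch.Generator(device="cuda").manual_seed(0)

prediction = pipe(
    image,
    num_inference_steps=1,
    processing_resolution=768,
    ensemble_size=3,
    ensembling_kwargs={"reduction": "mean"},
    generator=generator,
).prediction
print(prediction.shape)  # (1, H, W, 1) when match_input_resolution is left at its default
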
diff --git a/tests/pipelines/marigold/test_marigold_normals.py b/tests/pipelines/marigold/test_marigold_normals.py
new file mode 100644
index 000000000000..c86c600be8e5
--- /dev/null
+++ b/tests/pipelines/marigold/test_marigold_normals.py
@@ -0,0 +1,459 @@
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# More information and citation instructions are available on the
+# Marigold project website: https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+ AutoencoderKL,
+ AutoencoderTiny,
+ LCMScheduler,
+ MarigoldNormalsPipeline,
+ UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import (
+ enable_full_determinism,
+ floats_tensor,
+ load_image,
+ require_torch_gpu,
+ slow,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class MarigoldNormalsPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+ pipeline_class = MarigoldNormalsPipeline
+ params = frozenset(["image"])
+ batch_params = frozenset(["image"])
+ image_params = frozenset(["image"])
+ image_latents_params = frozenset(["latents"])
+ callback_cfg_params = frozenset([])
+ test_xformers_attention = False
+ required_optional_params = frozenset(
+ [
+ "num_inference_steps",
+ "generator",
+ "output_type",
+ ]
+ )
+
+ def get_dummy_components(self, time_cond_proj_dim=None):
+ torch.manual_seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ time_cond_proj_dim=time_cond_proj_dim,
+ sample_size=32,
+ in_channels=8,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+ cross_attention_dim=32,
+ )
+ torch.manual_seed(0)
+ scheduler = LCMScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ prediction_type="v_prediction",
+ set_alpha_to_one=False,
+ steps_offset=1,
+ beta_schedule="scaled_linear",
+ clip_sample=False,
+ thresholding=False,
+ )
+ torch.manual_seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ )
+ torch.manual_seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+ components = {
+ "unet": unet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "text_encoder": text_encoder,
+ "tokenizer": tokenizer,
+ "prediction_type": "normals",
+ "use_full_z_range": True,
+ }
+ return components
+
+ def get_dummy_tiny_autoencoder(self):
+ return AutoencoderTiny(in_channels=3, out_channels=3, latent_channels=4)
+
+ def get_dummy_inputs(self, device, seed=0):
+ image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+ image = image / 2 + 0.5
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+ inputs = {
+ "image": image,
+ "num_inference_steps": 1,
+ "processing_resolution": 0,
+ "generator": generator,
+ "output_type": "np",
+ }
+ return inputs
+
+ def _test_marigold_normals(
+ self,
+ generator_seed: int = 0,
+ expected_slice: np.ndarray = None,
+ atol: float = 1e-4,
+ **pipe_kwargs,
+ ):
+ device = "cpu"
+ components = self.get_dummy_components()
+
+ pipe = self.pipeline_class(**components)
+ pipe.to(device)
+ pipe.set_progress_bar_config(disable=None)
+
+ pipe_inputs = self.get_dummy_inputs(device, seed=generator_seed)
+ pipe_inputs.update(**pipe_kwargs)
+
+ prediction = pipe(**pipe_inputs).prediction
+
+ prediction_slice = prediction[0, -3:, -3:, -1].flatten()
+
+ if pipe_inputs.get("match_input_resolution", True):
+ self.assertEqual(prediction.shape, (1, 32, 32, 3), "Unexpected output resolution")
+ else:
+ self.assertTrue(prediction.shape[0] == 1 and prediction.shape[3] == 3, "Unexpected output dimensions")
+ self.assertEqual(
+ max(prediction.shape[1:3]),
+ pipe_inputs.get("processing_resolution", 768),
+ "Unexpected output resolution",
+ )
+
+ self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
+
+ def test_marigold_normals_dummy_defaults(self):
+ self._test_marigold_normals(
+ expected_slice=np.array([0.0967, 0.5234, 0.1448, -0.3155, -0.2550, -0.5578, 0.6854, 0.5657, -0.1263]),
+ )
+
+ def test_marigold_normals_dummy_G0_S1_P32_E1_B1_M1(self):
+ self._test_marigold_normals(
+ generator_seed=0,
+ expected_slice=np.array([0.0967, 0.5234, 0.1448, -0.3155, -0.2550, -0.5578, 0.6854, 0.5657, -0.1263]),
+ num_inference_steps=1,
+ processing_resolution=32,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_dummy_G0_S1_P16_E1_B1_M1(self):
+ self._test_marigold_normals(
+ generator_seed=0,
+ expected_slice=np.array([-0.4128, -0.5918, -0.6540, 0.2446, -0.2687, -0.4607, 0.2935, -0.0483, -0.2086]),
+ num_inference_steps=1,
+ processing_resolution=16,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_dummy_G2024_S1_P32_E1_B1_M1(self):
+ self._test_marigold_normals(
+ generator_seed=2024,
+ expected_slice=np.array([0.5731, -0.7631, -0.0199, 0.1609, -0.4628, -0.7044, 0.5761, -0.3471, -0.4498]),
+ num_inference_steps=1,
+ processing_resolution=32,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_dummy_G0_S2_P32_E1_B1_M1(self):
+ self._test_marigold_normals(
+ generator_seed=0,
+ expected_slice=np.array([0.1017, -0.6823, -0.2533, 0.1988, 0.3389, 0.8478, 0.7757, 0.5220, 0.8668]),
+ num_inference_steps=2,
+ processing_resolution=32,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_dummy_G0_S1_P64_E1_B1_M1(self):
+ self._test_marigold_normals(
+ generator_seed=0,
+ expected_slice=np.array([-0.2391, 0.7969, 0.6224, 0.0698, 0.5669, -0.2167, -0.1362, -0.8945, -0.5501]),
+ num_inference_steps=1,
+ processing_resolution=64,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_dummy_G0_S1_P32_E3_B1_M1(self):
+ self._test_marigold_normals(
+ generator_seed=0,
+ expected_slice=np.array([0.3826, -0.9634, -0.3835, 0.3514, 0.0691, -0.6182, 0.8709, 0.1590, -0.2181]),
+ num_inference_steps=1,
+ processing_resolution=32,
+ ensemble_size=3,
+ ensembling_kwargs={"reduction": "mean"},
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_dummy_G0_S1_P32_E4_B2_M1(self):
+ self._test_marigold_normals(
+ generator_seed=0,
+ expected_slice=np.array([0.2500, -0.3928, -0.2415, 0.1133, 0.2357, -0.4223, 0.9967, 0.4859, -0.1282]),
+ num_inference_steps=1,
+ processing_resolution=32,
+ ensemble_size=4,
+ ensembling_kwargs={"reduction": "mean"},
+ batch_size=2,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_dummy_G0_S1_P16_E1_B1_M0(self):
+ self._test_marigold_normals(
+ generator_seed=0,
+ expected_slice=np.array([0.9588, 0.3326, -0.0825, -0.0994, -0.3534, -0.4302, 0.3562, 0.4421, -0.2086]),
+ num_inference_steps=1,
+ processing_resolution=16,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=False,
+ )
+
+ def test_marigold_normals_dummy_no_num_inference_steps(self):
+ with self.assertRaises(ValueError) as e:
+ self._test_marigold_normals(
+ num_inference_steps=None,
+ expected_slice=np.array([0.0]),
+ )
+ self.assertIn("num_inference_steps", str(e))
+
+ def test_marigold_normals_dummy_no_processing_resolution(self):
+ with self.assertRaises(ValueError) as e:
+ self._test_marigold_normals(
+ processing_resolution=None,
+ expected_slice=np.array([0.0]),
+ )
+ self.assertIn("processing_resolution", str(e))
+
+
+@slow
+@require_torch_gpu
+class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ def tearDown(self):
+ super().tearDown()
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ def _test_marigold_normals(
+ self,
+ is_fp16: bool = True,
+ device: str = "cuda",
+ generator_seed: int = 0,
+ expected_slice: np.ndarray = None,
+ model_id: str = "prs-eth/marigold-normals-lcm-v0-1",
+ image_url: str = "https://marigoldmonodepth.github.io/images/einstein.jpg",
+ atol: float = 1e-4,
+ **pipe_kwargs,
+ ):
+ from_pretrained_kwargs = {}
+ if is_fp16:
+ from_pretrained_kwargs["variant"] = "fp16"
+ from_pretrained_kwargs["torch_dtype"] = torch.float16
+
+ pipe = MarigoldNormalsPipeline.from_pretrained(model_id, **from_pretrained_kwargs)
+ if device == "cuda":
+ pipe.enable_model_cpu_offload()
+ pipe.set_progress_bar_config(disable=None)
+
+ generator = torch.Generator(device=device).manual_seed(generator_seed)
+
+ image = load_image(image_url)
+ width, height = image.size
+
+ prediction = pipe(image, generator=generator, **pipe_kwargs).prediction
+
+ prediction_slice = prediction[0, -3:, -3:, -1].flatten()
+
+ if pipe_kwargs.get("match_input_resolution", True):
+ self.assertEqual(prediction.shape, (1, height, width, 3), "Unexpected output resolution")
+ else:
+ self.assertTrue(prediction.shape[0] == 1 and prediction.shape[3] == 3, "Unexpected output dimensions")
+ self.assertEqual(
+ max(prediction.shape[1:3]),
+ pipe_kwargs.get("processing_resolution", 768),
+ "Unexpected output resolution",
+ )
+
+ self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
+
+ def test_marigold_normals_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
+ self._test_marigold_normals(
+ is_fp16=False,
+ device="cpu",
+ generator_seed=0,
+ expected_slice=np.array([0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971]),
+ num_inference_steps=1,
+ processing_resolution=32,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
+ self._test_marigold_normals(
+ is_fp16=False,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.7980, 0.7952, 0.7914, 0.7931, 0.7871, 0.7816, 0.7844, 0.7710, 0.7601]),
+ num_inference_steps=1,
+ processing_resolution=768,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
+ self._test_marigold_normals(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.7979, 0.7949, 0.7915, 0.7930, 0.7871, 0.7817, 0.7842, 0.7710, 0.7603]),
+ num_inference_steps=1,
+ processing_resolution=768,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
+ self._test_marigold_normals(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=2024,
+ expected_slice=np.array([0.8428, 0.8428, 0.8433, 0.8369, 0.8325, 0.8315, 0.8271, 0.8135, 0.8057]),
+ num_inference_steps=1,
+ processing_resolution=768,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
+ self._test_marigold_normals(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.7095, 0.7095, 0.7104, 0.7070, 0.7051, 0.7061, 0.7017, 0.6938, 0.6914]),
+ num_inference_steps=2,
+ processing_resolution=768,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
+ self._test_marigold_normals(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.7168, 0.7163, 0.7163, 0.7080, 0.7061, 0.7046, 0.7031, 0.7007, 0.6987]),
+ num_inference_steps=1,
+ processing_resolution=512,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
+ self._test_marigold_normals(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.7114, 0.7124, 0.7144, 0.7085, 0.7070, 0.7080, 0.7051, 0.6958, 0.6924]),
+ num_inference_steps=1,
+ processing_resolution=768,
+ ensemble_size=3,
+ ensembling_kwargs={"reduction": "mean"},
+ batch_size=1,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
+ self._test_marigold_normals(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.7412, 0.7441, 0.7490, 0.7383, 0.7388, 0.7437, 0.7329, 0.7271, 0.7300]),
+ num_inference_steps=1,
+ processing_resolution=768,
+ ensemble_size=4,
+ ensembling_kwargs={"reduction": "mean"},
+ batch_size=2,
+ match_input_resolution=True,
+ )
+
+ def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
+ self._test_marigold_normals(
+ is_fp16=True,
+ device="cuda",
+ generator_seed=0,
+ expected_slice=np.array([0.7188, 0.7144, 0.7134, 0.7178, 0.7207, 0.7222, 0.7231, 0.7041, 0.6987]),
+ num_inference_steps=1,
+ processing_resolution=512,
+ ensemble_size=1,
+ batch_size=1,
+ match_input_resolution=False,
+ )
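
The normals pipeline mirrors the depth sketch shown after the depth tests; only the checkpoint and the channel count of the output change. A compact sketch, again with values drawn from the tests above:

import torch
from diffusers import MarigoldNormalsPipeline
from diffusers.utils import load_image

pipe = MarigoldNormalsPipeline.from_pretrained(
    "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

image = load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
prediction = pipe(image, num_inference_steps=1, processing_resolution=768).prediction
print(prediction.shape)  # (1, H, W, 3): per-pixel surface normals
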