[UPDATE] add default_denoising_steps, default_processing_resolution

prs-eth · May 24, 2024 · d129f6c · d129f6c
1 parent 528d02e
commit d129f6c
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 42 deletions.
diff --git a/README.md b/README.md
@@ -125,12 +125,10 @@ Activate the environment again after restarting the terminal session.
 
 ### 🚀 Run inference with LCM (faster)
 
-The [LCM checkpoint](https://huggingface.co/prs-eth/marigold-lcm-v1-0) is distilled from our original checkpoint towards faster inference speed (by reducing inference steps). The inference steps can be as few as 1 to 4:
+The [LCM checkpoint](https://huggingface.co/prs-eth/marigold-lcm-v1-0) is distilled from our original checkpoint towards faster inference speed (by reducing inference steps). The number of inference steps can be as few as 1 (the default) and up to 4. To run with the default LCM settings:
 
 ```bash
  python run.py \
-     --denoise_steps 4 \
-     --ensemble_size 5 \
      --input_rgb_dir input/in-the-wild_example \
      --output_dir output/in-the-wild_example_lcm
  ```
@@ -156,11 +154,11 @@ The default settings are optimized for the best result. However, the behavior of
 
 - Trade-offs between the **accuracy** and **speed** (for both options, larger values result in better accuracy at the cost of slower inference.)
   - `--ensemble_size`: Number of inference passes in the ensemble. For LCM `ensemble_size` is more important than `denoise_steps`. Default: ~~10~~ 5 (for LCM).
-  - `--denoise_steps`: Number of denoising steps of each inference pass. For the original (DDIM) version, it's recommended to use 10-50 steps, while for LCM 1-4 steps. Default: ~~10~~ 4 (for LCM).
+  - `--denoise_steps`: Number of denoising steps of each inference pass. For the original (DDIM) version, it's recommended to use 10-50 steps, while for LCM 1-4 steps. When unassigned (`None`), the default setting is read from the model config. Default: ~~10 4 (for LCM)~~ `None`.
 
 - By default, the inference script resizes input images to the *processing resolution*, and then resizes the prediction back to the original resolution. This gives the best quality, as Stable Diffusion, from which Marigold is derived, performs best at 768x768 resolution.  
   
-  - `--processing_res`: the processing resolution; set 0 to process the input resolution directly. Default: 768.
+  - `--processing_res`: the processing resolution; set to 0 to process the input resolution directly. When unassigned (`None`), the default setting is read from the model config. Default: ~~768~~ `None`.
   - `--output_processing_res`: produce output at the processing resolution instead of upsampling it to the input resolution. Default: False.
   - `--resample_method`: resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`. Default: `bilinear`.
 

diff --git a/marigold/marigold_pipeline.py b/marigold/marigold_pipeline.py
@@ -1,4 +1,5 @@
 # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+# Last modified: 2024-05-24
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,7 +20,7 @@
 
 
 import logging
-from typing import Dict, Union
+from typing import Dict, Optional, Union
 
 import numpy as np
 import torch
@@ -85,6 +86,25 @@ class MarigoldPipeline(DiffusionPipeline):
             Text-encoder, for empty text embedding.
         tokenizer (`CLIPTokenizer`):
             CLIP tokenizer.
+        scale_invariant (`bool`, *optional*):
+            A model property specifying whether the predicted depth maps are scale-invariant. This value must be set in
+            the model config. When used together with the `shift_invariant=True` flag, the model is also called
+            "affine-invariant". NB: overriding this value is not supported.
+        shift_invariant (`bool`, *optional*):
+            A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
+            the model config. When used together with the `scale_invariant=True` flag, the model is also called
+            "affine-invariant". NB: overriding this value is not supported.
+        default_denoising_steps (`int`, *optional*):
+            The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
+            quality with the given model. This value must be set in the model config. When the pipeline is called
+            without explicitly setting `denoising_steps`, the default value is used. This is required to ensure
+            reasonable results with various model flavors compatible with the pipeline, such as those relying on very
+            short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
+        default_processing_resolution (`int`, *optional*):
+            The recommended value of the `processing_res` parameter of the pipeline. This value must be set in
+            the model config. When the pipeline is called without explicitly setting `processing_res`, the
+            default value is used. This is required to ensure reasonable results with various model flavors trained
+            with varying optimal processing resolution values.
     """
 
     rgb_latent_scale_factor = 0.18215
@@ -97,23 +117,12 @@ def __init__(
         scheduler: Union[DDIMScheduler, LCMScheduler],
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
-        scale_invariant: bool = None,
-        shift_invariant: bool = None,
+        scale_invariant: Optional[bool] = True,
+        shift_invariant: Optional[bool] = True,
+        default_denoising_steps: Optional[int] = None,
+        default_processing_resolution: Optional[int] = None,
     ):
         super().__init__()
-        if scale_invariant is None:
-            logging.warn(
-                "`scale_invariant` is required but not given, filled with `True`"
-            )
-            scale_invariant = True
-        if shift_invariant is None:
-            logging.warn(
-                "`shift_invariant` is required but not given, filled with `True`"
-            )
-            shift_invariant = True
-        self.scale_invariant = scale_invariant
-        self.shift_invariant = shift_invariant
-
         self.register_modules(
             unet=unet,
             vae=vae,
@@ -124,17 +133,24 @@ def __init__(
         self.register_to_config(
             scale_invariant=scale_invariant,
             shift_invariant=shift_invariant,
+            default_denoising_steps=default_denoising_steps,
+            default_processing_resolution=default_processing_resolution,
         )
 
+        self.scale_invariant = scale_invariant
+        self.shift_invariant = shift_invariant
+        self.default_denoising_steps = default_denoising_steps
+        self.default_processing_resolution = default_processing_resolution
+
         self.empty_text_embed = None
 
     @torch.no_grad()
     def __call__(
         self,
         input_image: Union[Image.Image, torch.Tensor],
-        denoising_steps: int = 10,
-        ensemble_size: int = 10,
-        processing_res: int = 768,
+        denoising_steps: Optional[int] = None,
+        ensemble_size: int = 5,
+        processing_res: Optional[int] = None,
         match_input_res: bool = True,
         resample_method: str = "bilinear",
         batch_size: int = 0,
@@ -149,18 +165,21 @@ def __call__(
         Args:
             input_image (`Image`):
                 Input RGB (or gray-scale) image.
-            processing_res (`int`, *optional*, defaults to `768`):
-                Maximum resolution of processing.
-                If set to 0: will not resize at all.
+            denoising_steps (`int`, *optional*, defaults to `None`):
+                Number of denoising diffusion steps during inference. The default value `None` results in automatic
+                selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
+                for Marigold-LCM models.
+            ensemble_size (`int`, *optional*, defaults to `5`):
+                Number of predictions to be ensembled.
+            processing_res (`int`, *optional*, defaults to `None`):
+                Effective processing resolution. When set to `0`, processes at the original image resolution. This
+                produces crisper predictions, but may also lead to the overall loss of global context. The default
+                value `None` resolves to the optimal value from the model config.
             match_input_res (`bool`, *optional*, defaults to `True`):
                 Resize depth prediction to match input resolution.
                 Only valid if `processing_res` > 0.
             resample_method: (`str`, *optional*, defaults to `bilinear`):
                 Resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`.
-            denoising_steps (`int`, *optional*, defaults to `10`):
-                Number of diffusion denoising steps (DDIM) during inference.
-            ensemble_size (`int`, *optional*, defaults to `10`):
-                Number of predictions to be ensembled.
             batch_size (`int`, *optional*, defaults to `0`):
                 Inference batch size, no bigger than `ensemble_size`.
                 If set to 0, the script will automatically decide the proper batch size.
@@ -183,6 +202,12 @@ def __call__(
             - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty (MAD, median absolute deviation)
                     coming from ensembling. None if `ensemble_size = 1`
         """
+        # Model-specific optimal default values leading to fast and reasonable results.
+        if denoising_steps is None:
+            denoising_steps = self.default_denoising_steps
+        if processing_res is None:
+            processing_res = self.default_processing_resolution
+
         assert processing_res >= 0
         assert ensemble_size >= 1
 

diff --git a/run.py b/run.py
@@ -62,7 +62,7 @@
     parser.add_argument(
         "--denoise_steps",
         type=int,
-        default=4,
+        default=None,
         help="Diffusion denoising steps, more steps results in higher accuracy but slower inference speed. For the original (DDIM) version, it's recommended to use 10-50 steps, while for LCM 1-4 steps.",
     )
     parser.add_argument(
@@ -82,7 +82,7 @@
     parser.add_argument(
         "--processing_res",
         type=int,
-        default=768,
+        default=None,
         help="Maximum resolution of processing. 0 for using input image resolution. Default: 768.",
     )
     parser.add_argument(
@@ -153,14 +153,6 @@
         batch_size = 1  # set default batchsize
 
     # -------------------- Preparation --------------------
-    # Print out config
-    logging.info(
-        f"Inference settings: checkpoint = `{checkpoint_path}`, "
-        f"with denoise_steps = {denoise_steps}, ensemble_size = {ensemble_size}, "
-        f"processing resolution = {processing_res}, seed = {seed}; "
-        f"color_map = {color_map}."
-    )
-
     # Output directories
     output_dir_color = os.path.join(output_dir, "depth_colored")
     output_dir_tif = os.path.join(output_dir, "depth_bw")
@@ -210,7 +202,7 @@
         dtype = torch.float32
         variant = None
 
-    pipe = MarigoldPipeline.from_pretrained(
+    pipe: MarigoldPipeline = MarigoldPipeline.from_pretrained(
         checkpoint_path, variant=variant, torch_dtype=dtype
     )
 
@@ -224,6 +216,16 @@
         f"scale_invariant: {pipe.scale_invariant}, shift_invariant: {pipe.shift_invariant}"
     )
 
+    # Print out config
+    logging.info(
+        f"Inference settings: checkpoint = `{checkpoint_path}`, "
+        f"with denoise_steps = {denoise_steps or pipe.default_denoising_steps}, "
+        f"ensemble_size = {ensemble_size}, "
+        f"processing resolution = {processing_res or pipe.default_processing_resolution}, "
+        f"seed = {seed}; "
+        f"color_map = {color_map}."
+    )
+
     # -------------------- Inference and saving --------------------
     with torch.no_grad():
         os.makedirs(output_dir, exist_ok=True)