diff --git a/README.md b/README.md
index 13ac069..8e45365 100644
--- a/README.md
+++ b/README.md
@@ -125,12 +125,10 @@ Activate the environment again after restarting the terminal session.
 
 ### 🚀 Run inference with LCM (faster)
 
-The [LCM checkpoint](https://huggingface.co/prs-eth/marigold-lcm-v1-0) is distilled from our original checkpoint towards faster inference speed (by reducing inference steps). The inference steps can be as few as 1 to 4:
+The [LCM checkpoint](https://huggingface.co/prs-eth/marigold-lcm-v1-0) is distilled from our original checkpoint for faster inference (by reducing inference steps). The inference steps can be as few as 1 (default) to 4. Run with the default LCM settings:
 
 ```bash
 python run.py \
-    --denoise_steps 4 \
-    --ensemble_size 5 \
     --input_rgb_dir input/in-the-wild_example \
     --output_dir output/in-the-wild_example_lcm
 ```
@@ -156,11 +154,11 @@ The default settings are optimized for the best result. However, the behavior of
 
 - Trade-offs between the **accuracy** and **speed** (for both options, larger values result in better accuracy at the cost of slower inference.)
   - `--ensemble_size`: Number of inference passes in the ensemble. For LCM `ensemble_size` is more important than `denoise_steps`. Default: ~~10~~ 5 (for LCM).
-  - `--denoise_steps`: Number of denoising steps of each inference pass. For the original (DDIM) version, it's recommended to use 10-50 steps, while for LCM 1-4 steps. Default: ~~10~~ 4 (for LCM).
+  - `--denoise_steps`: Number of denoising steps of each inference pass. For the original (DDIM) version, it's recommended to use 10-50 steps, while for LCM 1-4 steps. When unassigned (`None`), the default setting is read from the model config. Default: ~~10~~ ~~4 (for LCM)~~ `None`.
 
 - By default, the inference script resizes input images to the *processing resolution*, and then resizes the prediction back to the original resolution. This gives the best quality, as Stable Diffusion, from which Marigold is derived, performs best at 768x768 resolution.
 
-  - `--processing_res`: the processing resolution; set 0 to process the input resolution directly. Default: 768.
+  - `--processing_res`: the processing resolution; set to 0 to process the input resolution directly. When unassigned (`None`), the default setting is read from the model config. Default: ~~768~~ `None`.
   - `--output_processing_res`: produce output at the processing resolution instead of upsampling it to the input resolution. Default: False.
 - `--resample_method`: resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`. Default: `bilinear`.
diff --git a/infer.py b/infer.py
index 3bc3eb5..5db976f 100644
--- a/infer.py
+++ b/infer.py
@@ -1,4 +1,4 @@
-# Last modified: 2024-04-15
+# Last modified: 2024-05-24
 # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -213,6 +213,9 @@ def check_directory(directory):
         logging.debug("run without xformers")
 
     pipe = pipe.to(device)
+    logging.info(
+        f"scale_invariant: {pipe.scale_invariant}, shift_invariant: {pipe.shift_invariant}"
+    )
 
     # -------------------- Inference and saving --------------------
     with torch.no_grad():
diff --git a/marigold/marigold_pipeline.py b/marigold/marigold_pipeline.py
index ec9ea4e..9410b32 100644
--- a/marigold/marigold_pipeline.py
+++ b/marigold/marigold_pipeline.py
@@ -1,4 +1,5 @@
 # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+# Last modified: 2024-05-24
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,7 +20,7 @@
 import logging
 
-from typing import Dict, Union
+from typing import Dict, Optional, Union
 
 import numpy as np
 import torch
@@ -33,13 +34,13 @@
 from diffusers.utils import BaseOutput
 from PIL import Image
 from torch.utils.data import DataLoader, TensorDataset
-from torchvision.transforms.functional import resize, pil_to_tensor
 from torchvision.transforms import InterpolationMode
+from torchvision.transforms.functional import pil_to_tensor, resize
 from tqdm.auto import tqdm
 from transformers import CLIPTextModel, CLIPTokenizer
 
 from .util.batchsize import find_batch_size
-from .util.ensemble import ensemble_depths
+from .util.ensemble import ensemble_depth
 from .util.image_util import (
     chw2hwc,
     colorize_depth_maps,
@@ -85,6 +86,25 @@ class MarigoldPipeline(DiffusionPipeline):
            Text-encoder, for empty text embedding.
        tokenizer (`CLIPTokenizer`):
            CLIP tokenizer.
+        scale_invariant (`bool`, *optional*):
+            A model property specifying whether the predicted depth maps are scale-invariant. This value must be set in
+            the model config. When used together with the `shift_invariant=True` flag, the model is also called
+            "affine-invariant". NB: overriding this value is not supported.
+        shift_invariant (`bool`, *optional*):
+            A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
+            the model config. When used together with the `scale_invariant=True` flag, the model is also called
+            "affine-invariant". NB: overriding this value is not supported.
+        default_denoising_steps (`int`, *optional*):
+            The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
+            quality with the given model. This value must be set in the model config. When the pipeline is called
+            without explicitly setting `denoising_steps`, the default value is used. This is required to ensure
+            reasonable results with various model flavors compatible with the pipeline, such as those relying on very
+            short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
+        default_processing_resolution (`int`, *optional*):
+            The recommended value of the `processing_res` parameter of the pipeline. This value must be set in
+            the model config. When the pipeline is called without explicitly setting `processing_res`, the
+            default value is used. This is required to ensure reasonable results with various model flavors trained
+            with varying optimal processing resolution values.
     """
 
     rgb_latent_scale_factor = 0.18215
 
@@ -97,9 +117,12 @@ def __init__(
         self,
         unet: UNet2DConditionModel,
         vae: AutoencoderKL,
         scheduler: Union[DDIMScheduler, LCMScheduler],
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
+        scale_invariant: Optional[bool] = True,
+        shift_invariant: Optional[bool] = True,
+        default_denoising_steps: Optional[int] = None,
+        default_processing_resolution: Optional[int] = None,
     ):
         super().__init__()
-
         self.register_modules(
             unet=unet,
             vae=vae,
@@ -107,6 +130,17 @@
             text_encoder=text_encoder,
             tokenizer=tokenizer,
         )
+        self.register_to_config(
+            scale_invariant=scale_invariant,
+            shift_invariant=shift_invariant,
+            default_denoising_steps=default_denoising_steps,
+            default_processing_resolution=default_processing_resolution,
+        )
+
+        self.scale_invariant = scale_invariant
+        self.shift_invariant = shift_invariant
+        self.default_denoising_steps = default_denoising_steps
+        self.default_processing_resolution = default_processing_resolution
 
         self.empty_text_embed = None
 
@@ -114,9 +148,9 @@ def __call__(
         self,
         input_image: Union[Image.Image, torch.Tensor],
-        denoising_steps: int = 10,
-        ensemble_size: int = 10,
-        processing_res: int = 768,
+        denoising_steps: Optional[int] = None,
+        ensemble_size: int = 5,
+        processing_res: Optional[int] = None,
         match_input_res: bool = True,
         resample_method: str = "bilinear",
         batch_size: int = 0,
@@ -131,18 +165,21 @@
        Args:
            input_image (`Image`):
                Input RGB (or gray-scale) image.
-            processing_res (`int`, *optional*, defaults to `768`):
-                Maximum resolution of processing.
-                If set to 0: will not resize at all.
+            denoising_steps (`int`, *optional*, defaults to `None`):
+                Number of denoising diffusion steps during inference. The default value `None` results in automatic
+                selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
+                for Marigold-LCM models.
+            ensemble_size (`int`, *optional*, defaults to `5`):
+                Number of predictions to be ensembled.
+            processing_res (`int`, *optional*, defaults to `None`):
+                Effective processing resolution. When set to `0`, processes at the original image resolution. This
+                produces crisper predictions, but may also lead to the overall loss of global context. The default
+                value `None` resolves to the optimal value from the model config.
            match_input_res (`bool`, *optional*, defaults to `True`):
                Resize depth prediction to match input resolution.
                Only valid if `processing_res` > 0.
            resample_method: (`str`, *optional*, defaults to `bilinear`):
                Resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`.
            batch_size (`int`, *optional*, defaults to `0`):
                Inference batch size, no bigger than `num_ensemble`.
                If set to 0, the script will automatically decide the proper batch size.
@@ -152,6 +189,10 @@
                Display a progress bar of diffusion denoising.
            color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized depth map generation):
                Colormap used to colorize the depth map.
+            scale_invariant (`bool`, *optional*, defaults to `True`):
+                Flag of scale-invariant prediction; if True, the scale is adjusted from the raw prediction.
+            shift_invariant (`bool`, *optional*, defaults to `True`):
+                Flag of shift-invariant prediction; if True, the shift is adjusted from the raw prediction; if False, the near plane is fixed at 0m.
            ensemble_kwargs (`dict`, *optional*, defaults to `None`):
                Arguments for detailed ensembling settings.
        Returns:
@@ -161,6 +202,12 @@
            `MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including:
            - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation)
                coming from ensembling. None if `ensemble_size = 1`
        """
+        # Model-specific optimal default values leading to fast and reasonable results.
+        if denoising_steps is None:
+            denoising_steps = self.default_denoising_steps
+        if processing_res is None:
+            processing_res = self.default_processing_resolution
+
        assert processing_res >= 0
        assert ensemble_size >= 1
 
@@ -175,14 +222,15 @@
            input_image = input_image.convert("RGB")
            # convert to torch tensor [H, W, rgb] -> [rgb, H, W]
            rgb = pil_to_tensor(input_image)
+            rgb = rgb.unsqueeze(0)  # [1, rgb, H, W]
        elif isinstance(input_image, torch.Tensor):
-            rgb = input_image.squeeze()
+            rgb = input_image
        else:
            raise TypeError(f"Unknown input type: {type(input_image) = }")
        input_size = rgb.shape
        assert (
-            3 == rgb.dim() and 3 == input_size[0]
-        ), f"Wrong input shape {input_size}, expected [rgb, H, W]"
+            4 == rgb.dim() and 3 == input_size[-3]
+        ), f"Wrong input shape {input_size}, expected [1, rgb, H, W]"
 
        # Resize image
        if processing_res > 0:
@@ -199,7 +247,7 @@
 
        # ----------------- Predicting depth -----------------
        # Batch repeated input image
-        duplicated_rgb = torch.stack([rgb_norm] * ensemble_size)
+        duplicated_rgb = rgb_norm.expand(ensemble_size, -1, -1, -1)
        single_rgb_dataset = TensorDataset(duplicated_rgb)
        if batch_size > 0:
            _bs = batch_size
@@ -231,35 +279,36 @@
                generator=generator,
            )
            depth_pred_ls.append(depth_pred_raw.detach())
-        depth_preds = torch.concat(depth_pred_ls, dim=0).squeeze()
+        depth_preds = torch.concat(depth_pred_ls, dim=0)
        torch.cuda.empty_cache()  # clear vram cache for ensembling
 
        # ----------------- Test-time ensembling -----------------
        if ensemble_size > 1:
-            depth_pred, pred_uncert = ensemble_depths(
-                depth_preds, **(ensemble_kwargs or {})
+            depth_pred, pred_uncert = ensemble_depth(
+                depth_preds,
+                scale_invariant=self.scale_invariant,
+                shift_invariant=self.shift_invariant,
+                max_res=50,
+                **(ensemble_kwargs or {}),
            )
        else:
            depth_pred = depth_preds
            pred_uncert = None
 
-        # ----------------- Post processing -----------------
-        # Scale prediction to [0, 1]
-        min_d = torch.min(depth_pred)
-        max_d = torch.max(depth_pred)
-        depth_pred = (depth_pred - min_d) / (max_d - min_d)
-
        # Resize back to original resolution
        if match_input_res:
            depth_pred = resize(
-                depth_pred.unsqueeze(0),
-                input_size[1:],
+                depth_pred,
+                input_size[-2:],
                interpolation=resample_method,
                antialias=True,
-            ).squeeze()
+            )
 
        # Convert to numpy
+        depth_pred = depth_pred.squeeze()
        depth_pred = depth_pred.cpu().numpy()
+        if pred_uncert is not None:
+            pred_uncert = pred_uncert.squeeze().cpu().numpy()
 
        # Clip output range
        depth_pred = depth_pred.clip(0, 1)
diff --git a/marigold/util/ensemble.py b/marigold/util/ensemble.py
index 5a2908e..d8c087b 100644
--- a/marigold/util/ensemble.py
+++ b/marigold/util/ensemble.py
@@ -18,10 +18,13 @@
 # --------------------------------------------------------------------------
 
+from functools import partial
+from typing import Optional, Tuple
+
 import numpy as np
 import torch
-from scipy.optimize import minimize
+from .image_util import get_tv_resample_method, resize_max_res
 
 
 def inter_distances(tensors: torch.Tensor):
@@ -37,96 +40,161 @@
     return dist
 
 
-def ensemble_depths(
-    input_images: torch.Tensor,
+def ensemble_depth(
+    depth: torch.Tensor,
+    scale_invariant: bool = True,
+    shift_invariant: bool = True,
+    output_uncertainty: bool = False,
+    reduction: str = "median",
     regularizer_strength: float = 0.02,
     max_iter: int = 2,
     tol: float = 1e-3,
-    reduction: str = "median",
-    max_res: int = None,
-):
+    max_res: int = 1024,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
     """
-    To ensemble multiple affine-invariant depth images (up to scale and shift),
-    by aligning estimating the scale and shift
+    Ensembles depth maps represented by the `depth` tensor with expected shape `(B, 1, H, W)`, where B is the
+    number of ensemble members for a given prediction of size `(H x W)`. Even though the function is designed for
+    depth maps, it can also be used with disparity maps as long as the input tensor values are non-negative. The
+    alignment happens when the predictions have one or more degrees of freedom, that is when they are either
+    affine-invariant (`scale_invariant=True` and `shift_invariant=True`), or just scale-invariant (only
+    `scale_invariant=True`). For absolute predictions (`scale_invariant=False` and `shift_invariant=False`)
+    alignment is skipped and only ensembling is performed.
+
+    Args:
+        depth (`torch.Tensor`):
+            Input ensemble depth maps.
+        scale_invariant (`bool`, *optional*, defaults to `True`):
+            Whether to treat predictions as scale-invariant.
+        shift_invariant (`bool`, *optional*, defaults to `True`):
+            Whether to treat predictions as shift-invariant.
+        output_uncertainty (`bool`, *optional*, defaults to `False`):
+            Whether to output the uncertainty map.
+        reduction (`str`, *optional*, defaults to `"median"`):
+            Reduction method used to ensemble aligned predictions. The accepted values are: `"mean"` and
+            `"median"`.
+        regularizer_strength (`float`, *optional*, defaults to `0.02`):
+            Strength of the regularizer that pulls the aligned predictions to the unit range from 0 to 1.
+        max_iter (`int`, *optional*, defaults to `2`):
+            Maximum number of the alignment solver steps. Refer to `scipy.optimize.minimize` function, `options`
+            argument.
+        tol (`float`, *optional*, defaults to `1e-3`):
+            Alignment solver tolerance. The solver stops when the tolerance is reached.
+        max_res (`int`, *optional*, defaults to `1024`):
+            Resolution at which the alignment is performed; `None` matches the `processing_resolution`.
+    Returns:
+        A tensor of aligned and ensembled depth maps and optionally a tensor of uncertainties of the same shape:
+        `(1, 1, H, W)`.
     """
-    device = input_images.device
-    dtype = input_images.dtype
-    np_dtype = np.float32
-
-    original_input = input_images.clone()
-    n_img = input_images.shape[0]
-    ori_shape = input_images.shape
-
-    if max_res is not None:
-        scale_factor = torch.min(max_res / torch.tensor(ori_shape[-2:]))
-        if scale_factor < 1:
-            downscaler = torch.nn.Upsample(scale_factor=scale_factor, mode="nearest")
-            input_images = downscaler(input_images)
-
-    # init guess
-    _min = np.min(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1)
-    _max = np.max(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1)
-    s_init = 1.0 / (_max - _min).reshape((-1, 1, 1))
-    t_init = (-1 * s_init.flatten() * _min.flatten()).reshape((-1, 1, 1))
-    x = np.concatenate([s_init, t_init]).reshape(-1).astype(np_dtype)
-
-    input_images = input_images.to(device)
-
-    # objective function
-    def closure(x):
-        len_x = len(x)
-        s = x[: int(len_x / 2)]
-        t = x[int(len_x / 2) :]
-        s = torch.from_numpy(s).to(dtype=dtype).to(device)
-        t = torch.from_numpy(t).to(dtype=dtype).to(device)
-
-        transformed_arrays = input_images * s.view((-1, 1, 1)) + t.view((-1, 1, 1))
-        dists = inter_distances(transformed_arrays)
-        sqrt_dist = torch.sqrt(torch.mean(dists**2))
-
-        if "mean" == reduction:
-            pred = torch.mean(transformed_arrays, dim=0)
-        elif "median" == reduction:
-            pred = torch.median(transformed_arrays, dim=0).values
+    if depth.dim() != 4 or depth.shape[1] != 1:
+        raise ValueError(f"Expecting 4D tensor of shape [B,1,H,W]; got {depth.shape}.")
+    if reduction not in ("mean", "median"):
+        raise ValueError(f"Unrecognized reduction method: {reduction}.")
+    if not scale_invariant and shift_invariant:
+        raise ValueError("Pure shift-invariant ensembling is not supported.")
+
+    def init_param(depth: torch.Tensor):
+        init_min = depth.reshape(ensemble_size, -1).min(dim=1).values
+        init_max = depth.reshape(ensemble_size, -1).max(dim=1).values
+
+        if scale_invariant and shift_invariant:
+            init_s = 1.0 / (init_max - init_min).clamp(min=1e-6)
+            init_t = -init_s * init_min
+            param = torch.cat((init_s, init_t)).cpu().numpy()
+        elif scale_invariant:
+            init_s = 1.0 / init_max.clamp(min=1e-6)
+            param = init_s.cpu().numpy()
         else:
-            raise ValueError
-
-        near_err = torch.sqrt((0 - torch.min(pred)) ** 2)
-        far_err = torch.sqrt((1 - torch.max(pred)) ** 2)
-
-        err = sqrt_dist + (near_err + far_err) * regularizer_strength
-        err = err.detach().cpu().numpy().astype(np_dtype)
-        return err
-
-    res = minimize(
-        closure, x, method="BFGS", tol=tol, options={"maxiter": max_iter, "disp": False}
-    )
-    x = res.x
-    len_x = len(x)
-    s = x[: int(len_x / 2)]
-    t = x[int(len_x / 2) :]
-
-    # Prediction
-    s = torch.from_numpy(s).to(dtype=dtype).to(device)
-    t = torch.from_numpy(t).to(dtype=dtype).to(device)
-    transformed_arrays = original_input * s.view(-1, 1, 1) + t.view(-1, 1, 1)
-    if "mean" == reduction:
-        aligned_images = torch.mean(transformed_arrays, dim=0)
-        std = torch.std(transformed_arrays, dim=0)
-        uncertainty = std
-    elif "median" == reduction:
-        aligned_images = torch.median(transformed_arrays, dim=0).values
-        # MAD (median absolute deviation) as uncertainty indicator
-        abs_dev = torch.abs(transformed_arrays - aligned_images)
-        mad = torch.median(abs_dev, dim=0).values
-        uncertainty = mad
-    else:
-        raise ValueError(f"Unknown reduction method: {reduction}")
+            raise ValueError("Unrecognized alignment.")
+
+        return param
+
+    def align(depth: torch.Tensor, param: np.ndarray) -> torch.Tensor:
+        if scale_invariant and shift_invariant:
+            s, t = np.split(param, 2)
+            s = torch.from_numpy(s).to(depth).view(ensemble_size, 1, 1, 1)
+            t = torch.from_numpy(t).to(depth).view(ensemble_size, 1, 1, 1)
+            out = depth * s + t
+        elif scale_invariant:
+            s = torch.from_numpy(param).to(depth).view(ensemble_size, 1, 1, 1)
+            out = depth * s
+        else:
+            raise ValueError("Unrecognized alignment.")
+        return out
+
+    def ensemble(
+        depth_aligned: torch.Tensor, return_uncertainty: bool = False
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        uncertainty = None
+        if reduction == "mean":
+            prediction = torch.mean(depth_aligned, dim=0, keepdim=True)
+            if return_uncertainty:
+                uncertainty = torch.std(depth_aligned, dim=0, keepdim=True)
+        elif reduction == "median":
+            prediction = torch.median(depth_aligned, dim=0, keepdim=True).values
+            if return_uncertainty:
+                uncertainty = torch.median(
+                    torch.abs(depth_aligned - prediction), dim=0, keepdim=True
+                ).values
+        else:
+            raise ValueError(f"Unrecognized reduction method: {reduction}.")
+        return prediction, uncertainty
+
+    def cost_fn(param: np.ndarray, depth: torch.Tensor) -> float:
+        cost = 0.0
+        depth_aligned = align(depth, param)
+
+        for i, j in torch.combinations(torch.arange(ensemble_size)):
+            diff = depth_aligned[i] - depth_aligned[j]
+            cost += (diff**2).mean().sqrt().item()
+
+        if regularizer_strength > 0:
+            prediction, _ = ensemble(depth_aligned, return_uncertainty=False)
+            err_near = (0.0 - prediction.min()).abs().item()
+            err_far = (1.0 - prediction.max()).abs().item()
+            cost += (err_near + err_far) * regularizer_strength
+
+        return cost
 
-    # Scale and shift to [0, 1]
-    _min = torch.min(aligned_images)
-    _max = torch.max(aligned_images)
-    aligned_images = (aligned_images - _min) / (_max - _min)
-    uncertainty /= _max - _min
+    def compute_param(depth: torch.Tensor):
+        import scipy.optimize
+
+        depth_to_align = depth.to(torch.float32)
+        if max_res is not None and max(depth_to_align.shape[2:]) > max_res:
+            depth_to_align = resize_max_res(
+                depth_to_align, max_res, get_tv_resample_method("nearest-exact")
+            )
+
+        param = init_param(depth_to_align)
+
+        res = scipy.optimize.minimize(
+            partial(cost_fn, depth=depth_to_align),
+            param,
+            method="BFGS",
+            tol=tol,
+            options={"maxiter": max_iter, "disp": False},
+        )
+
+        return res.x
+
+    requires_aligning = scale_invariant or shift_invariant
+    ensemble_size = depth.shape[0]
+
+    if requires_aligning:
+        param = compute_param(depth)
+        depth = align(depth, param)
+
+    depth, uncertainty = ensemble(depth, return_uncertainty=output_uncertainty)
+
+    depth_max = depth.max()
+    if scale_invariant and shift_invariant:
+        depth_min = depth.min()
+    elif scale_invariant:
+        depth_min = 0
+    else:
+        raise ValueError("Unrecognized alignment.")
+    depth_range = (depth_max - depth_min).clamp(min=1e-6)
+    depth = (depth - depth_min) / depth_range
+    if output_uncertainty:
+        uncertainty /= depth_range
 
-    return aligned_images, uncertainty
+    return depth, uncertainty  # [1,1,H,W], [1,1,H,W]
diff --git a/marigold/util/image_util.py b/marigold/util/image_util.py
index 90f0623..82078fe 100644
--- a/marigold/util/image_util.py
+++ b/marigold/util/image_util.py
@@ -1,5 +1,5 @@
 # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
-# Last modified: 2024-04-16
+# Last modified: 2024-05-24
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -86,7 +86,7 @@
    Args:
        img (`torch.Tensor`):
-            Image tensor to be resized.
+            Image tensor to be resized. Expected shape: [B, C, H, W]
        max_edge_resolution (`int`):
            Maximum edge length (pixel).
        resample_method (`PIL.Image.Resampling`):
@@ -95,8 +95,9 @@
    Returns:
        `torch.Tensor`: Resized image.
    """
-    assert 3 == img.dim()
-    _, original_height, original_width = img.shape
+    assert 4 == img.dim(), f"Invalid input shape {img.shape}"
+
+    original_height, original_width = img.shape[-2:]
    downscale_factor = min(
        max_edge_resolution / original_width, max_edge_resolution / original_height
    )
@@ -113,6 +114,7 @@
        "bilinear": InterpolationMode.BILINEAR,
        "bicubic": InterpolationMode.BICUBIC,
        "nearest": InterpolationMode.NEAREST_EXACT,
+        "nearest-exact": InterpolationMode.NEAREST_EXACT,
    }
    resample_method = resample_method_dict.get(method_str, None)
    if resample_method is None:
diff --git a/run.py b/run.py
index 5274ca9..029e9fb 100644
--- a/run.py
+++ b/run.py
@@ -62,7 +62,7 @@
     parser.add_argument(
         "--denoise_steps",
         type=int,
-        default=4,
+        default=None,
         help="Diffusion denoising steps, more steps results in higher accuracy but slower inference speed. For the original (DDIM) version, it's recommended to use 10-50 steps, while for LCM 1-4 steps.",
     )
     parser.add_argument(
@@ -82,7 +82,7 @@
     parser.add_argument(
         "--processing_res",
         type=int,
-        default=768,
+        default=None,
-        help="Maximum resolution of processing. 0 for using input image resolution. Default: 768.",
+        help="Maximum resolution of processing. 0 for using input image resolution. When unassigned (None), the default from the model config is used.",
     )
     parser.add_argument(
@@ -153,14 +153,6 @@
     batch_size = 1  # set default batchsize
 
     # -------------------- Preparation --------------------
-    # Print out config
-    logging.info(
-        f"Inference settings: checkpoint = `{checkpoint_path}`, "
-        f"with denoise_steps = {denoise_steps}, ensemble_size = {ensemble_size}, "
-        f"processing resolution = {processing_res}, seed = {seed}; "
-        f"color_map = {color_map}."
-    )
-
     # Output directories
     output_dir_color = os.path.join(output_dir, "depth_colored")
     output_dir_tif = os.path.join(output_dir, "depth_bw")
@@ -210,7 +202,7 @@
     dtype = torch.float32
     variant = None
 
-    pipe = MarigoldPipeline.from_pretrained(
+    pipe: MarigoldPipeline = MarigoldPipeline.from_pretrained(
         checkpoint_path, variant=variant, torch_dtype=dtype
     )
@@ -220,6 +212,19 @@
         pass  # run without xformers
 
     pipe = pipe.to(device)
+    logging.info(
+        f"scale_invariant: {pipe.scale_invariant}, shift_invariant: {pipe.shift_invariant}"
+    )
+
+    # Print out config
+    logging.info(
+        f"Inference settings: checkpoint = `{checkpoint_path}`, "
+        f"with denoise_steps = {denoise_steps or pipe.default_denoising_steps}, "
+        f"ensemble_size = {ensemble_size}, "
+        f"processing resolution = {processing_res or pipe.default_processing_resolution}, "
+        f"seed = {seed}; "
+        f"color_map = {color_map}."
+    )
 
     # -------------------- Inference and saving --------------------
     with torch.no_grad():
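
As a quick illustration of the config-driven defaults introduced in this diff, here is a minimal usage sketch. The import path `marigold.marigold_pipeline` and the example image path are assumptions based on the file layout above, and the `depth_np` field on the returned `MarigoldDepthOutput` is likewise an assumption; only `uncertainty` is explicitly documented in the `__call__` docstring.

```python
import torch
from PIL import Image

from marigold.marigold_pipeline import MarigoldPipeline  # import path assumed from the file layout above

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# LCM checkpoint referenced in the README; its model config carries
# default_denoising_steps and default_processing_resolution.
pipe: MarigoldPipeline = MarigoldPipeline.from_pretrained(
    "prs-eth/marigold-lcm-v1-0", torch_dtype=torch.float32
)
pipe = pipe.to(device)
print(f"scale_invariant={pipe.scale_invariant}, shift_invariant={pipe.shift_invariant}")

image = Image.open("input/in-the-wild_example/example_0.jpg")  # illustrative path, any RGB image works

with torch.no_grad():
    out = pipe(
        image,
        denoising_steps=None,   # None -> pipe.default_denoising_steps from the model config
        processing_res=None,    # None -> pipe.default_processing_resolution
        ensemble_size=5,        # new default of __call__
        match_input_res=True,
        resample_method="bilinear",
        show_progress_bar=True,
    )

depth = out.depth_np  # assumed field name; `uncertainty` is documented in the docstring
print(depth.shape, None if out.uncertainty is None else out.uncertainty.shape)
```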
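The rewritten `ensemble_depth` can also be exercised on its own. The sketch below, under the assumption that SciPy is installed for the BFGS alignment, builds a synthetic ensemble whose members differ only by scale, shift, and a little noise, and checks that the aligned, ensembled output comes back in the documented `(1, 1, H, W)` shape; the module path follows the diff.

```python
import torch

from marigold.util.ensemble import ensemble_depth  # module path as in the diff

torch.manual_seed(0)
gt = torch.rand(1, 1, 64, 64)  # stand-in ground-truth depth in [0, 1]

# Ensemble members that differ by an affine transform plus small noise.
members = []
for s, t in [(1.0, 0.0), (2.5, 0.3), (0.7, 0.1), (1.8, 0.05)]:
    m = s * gt + t + 0.01 * torch.randn_like(gt)
    members.append(m.clamp(min=0.0))  # keep values non-negative, as the docstring expects
depth = torch.cat(members, dim=0)  # [B, 1, H, W], the shape ensemble_depth validates

ensembled, uncertainty = ensemble_depth(
    depth,
    scale_invariant=True,
    shift_invariant=True,
    output_uncertainty=True,
    reduction="median",
)
print(ensembled.shape, uncertainty.shape)  # both (1, 1, H, W); output rescaled to [0, 1]
```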