import logging |
|
import numpy as np |
|
import torch |
|
from typing import Dict, Union |
|
import math |
|
|
|
from diffusers import ( |
|
AutoencoderKL, |
|
DDIMScheduler, |
|
DiffusionPipeline, |
|
LCMScheduler, |
|
UNet2DConditionModel, |
|
AutoencoderTiny, |
|
) |
|
from diffusers.utils import BaseOutput |
|
from PIL import Image |
|
from torch.utils.data import DataLoader, TensorDataset |
|
from torchvision.transforms.functional import resize, pil_to_tensor |
|
from torchvision.transforms import InterpolationMode |
|
from tqdm.auto import tqdm |
|
from transformers import CLIPTextModel, CLIPTokenizer |
|
from functools import partial |
|
from typing import Optional, Tuple |
|
|
|
|
|
class MarigoldDepthOutput(BaseOutput): |
|
""" |
|
Output class for Marigold Monocular Depth Estimation pipeline. |
|
|
|
Args: |
|
depth_np (`np.ndarray`): |
|
Predicted depth map, with depth values in the range of [0, 1]. |
|
base_depth_np (`np.ndarray`): |
|
Upsampled base depth map, with depth values in the range of [0, 1]. |
|
This is the depth map used as a global guidance for the boosted inference. |
|
It is upsampled to the same resolution as the final depth map. |
|
This is useful for visualization and debugging purposes. |
|
""" |
|
|
|
depth_np: np.ndarray |
|
base_depth_np: np.ndarray |
|
|
|
|
|
class MarigoldDepthHRPipeline(DiffusionPipeline): |
|
""" |
|
Pipeline for high resolution monocular depth estimation using Marigold: https://marigoldcomputervision.github.io. |
|
|
|
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the |
|
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) |
|
|
|
Args: |
|
unet (`UNet2DConditionModel`): |
|
Conditional U-Net to denoise the prediction latent, conditioned on image latent. |
|
vae (`AutoencoderKL`): |
|
Variational Auto-Encoder (VAE) Model to encode and decode images and predictions |
|
to and from latent representations. |
|
scheduler (`DDIMScheduler`): |
|
A scheduler to be used in combination with `unet` to denoise the encoded image latents. |
|
text_encoder (`CLIPTextModel`): |
|
Text-encoder, for empty text embedding. |
|
tokenizer (`CLIPTokenizer`): |
|
CLIP tokenizer. |
|
boosting_unet (`UNet2DConditionModel`): |
|
Conditional U-Net to denoise the depth latent, conditioned on image latent and a global depth map. |
|
scale_invariant (`bool`, *optional*): |
|
A model property specifying whether the predicted depth maps are scale-invariant. This value must be set in |
|
the model config. When used together with the `shift_invariant=True` flag, the model is also called |
|
"affine-invariant". NB: overriding this value is not supported. |
|
shift_invariant (`bool`, *optional*): |
|
A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in |
|
the model config. When used together with the `scale_invariant=True` flag, the model is also called |
|
"affine-invariant". NB: overriding this value is not supported. |
|
default_denoising_steps (`int`, *optional*): |
|
The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable |
|
quality with the given model. This value must be set in the model config. When the pipeline is called |
|
without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure |
|
reasonable results with various model flavors compatible with the pipeline, such as those relying on very |
|
short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`). |
|
default_boosting_denoising_steps (`int`, *optional*): |
|
Same as `default_denoising_steps` but for `boosting_unet`. |
|
default_processing_resolution (`int`, *optional*): |
|
The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in |
|
the model config. When the pipeline is called without explicitly setting `processing_resolution`, the |
|
default value is used. This is required to ensure reasonable results with various model flavors trained |
|
with varying optimal processing resolution values.
base_depth_model_uri (`str`, *optional*):
    Hugging Face Hub id or local path of a base (low-resolution) depth pipeline. It is loaded with
    `DiffusionPipeline.from_pretrained` and used to produce the global guidance when no `base_depth`
    is passed at inference time.
variant (`str`, *optional*):
    Checkpoint variant (e.g. `"fp16"`) forwarded when loading the base depth pipeline.
|
""" |
|
|
|
latent_scale_factor = 0.18215 |
|
|
|
def __init__( |
|
self, |
|
unet: UNet2DConditionModel, |
|
vae: AutoencoderKL, |
|
scheduler: Union[DDIMScheduler, LCMScheduler], |
|
text_encoder: CLIPTextModel, |
|
tokenizer: CLIPTokenizer, |
|
boosting_unet: Optional[UNet2DConditionModel], |
|
scale_invariant: Optional[bool] = True, |
|
shift_invariant: Optional[bool] = True, |
|
default_denoising_steps: Optional[int] = None, |
|
default_boosting_denoising_steps: Optional[int] = None, |
|
default_processing_resolution: Optional[int] = None, |
|
base_depth_model_uri: Optional[str] = None, |
|
variant: Optional[str] = None, |
|
): |
|
super().__init__() |
|
|
|
if boosting_unet is None: |
|
logging.warning( |
|
"Boosting U-Net is not provided. If this message appears during training, it is expected." |
|
) |
|
|
|
self.register_modules( |
|
unet=unet, |
|
vae=vae, |
|
scheduler=scheduler, |
|
text_encoder=text_encoder, |
|
tokenizer=tokenizer, |
|
boosting_unet=boosting_unet, |
|
) |
|
self.register_to_config( |
|
scale_invariant=scale_invariant, |
|
shift_invariant=shift_invariant, |
|
default_denoising_steps=default_denoising_steps, |
|
default_boosting_denoising_steps=default_boosting_denoising_steps,
|
default_processing_resolution=default_processing_resolution, |
|
) |
|
|
|
self.register_to_config(base_depth_model_uri=base_depth_model_uri) |
|
|
|
self.scale_invariant = scale_invariant |
|
self.shift_invariant = shift_invariant |
|
self.default_denoising_steps = default_denoising_steps |
|
self.default_boosting_denoising_steps = default_boosting_denoising_steps |
|
self.default_processing_resolution = default_processing_resolution |
|
|
|
if base_depth_model_uri is not None: |
|
|
|
self.base_pipe = DiffusionPipeline.from_pretrained( |
|
base_depth_model_uri, |
|
variant=variant, |
|
torch_dtype=self.dtype, |
|
trust_remote_code=True, |
|
) |
|
self.base_pipe.to(self.device) |
|
else: |
|
self.base_pipe = None |
|
|
|
self.empty_text_embed = None |
|
|
|
@torch.no_grad() |
|
def __call__( |
|
self, |
|
input_image: Union[Image.Image, torch.Tensor], |
|
*, |
|
base_depth: Optional[Union[Image.Image, np.ndarray, torch.Tensor, MarigoldDepthOutput]] = None,
|
denoising_steps: Optional[int] = None, |
|
boosted_denoising_steps: Optional[int] = None, |
|
ensemble_size: int = 10, |
|
boosted_ensemble_size: int = 5, |
|
processing_res: Optional[int] = None, |
|
match_input_res: bool = True, |
|
resample_method: str = "bilinear", |
|
batch_size: int = 0, |
|
show_progress_bar: bool = True, |
|
ensemble_kwargs: Optional[Dict] = None,
|
upscale_factor: int = 2, |
|
) -> MarigoldDepthOutput: |
|
""" |
|
Function invoked when calling the pipeline. |
|
|
|
Args: |
|
input_image (`Image` or `torch.Tensor`): |
|
Input RGB (or gray-scale) image. |
|
base_depth (`Image`, `np.ndarray`, `torch.Tensor` or `MarigoldDepthOutput`, *optional*): |
|
Base depth map to be used as a global guidance for the boosted inference. |
|
denoising_steps (`int`, *optional*, defaults to `None`):
    Number of diffusion denoising steps (DDIM) during inference. If `None`, `default_denoising_steps`
    from the model config is used.
boosted_denoising_steps (`int`, *optional*, defaults to `None`):
    Number of denoising steps for the boosting U-Net. If `None`, `default_boosting_denoising_steps`
    from the model config is used.
|
ensemble_size (`int`, *optional*, defaults to `10`): |
|
Number of predictions to be ensembled. |
|
boosted_ensemble_size (`int`, *optional*, defaults to `5`): |
|
Number of predictions to be ensembled in the boosted inference. |
|
processing_res (`int`, *optional*, defaults to `None`):
    Maximum resolution of processing. If set to 0, a fallback of 768 is used. If `None`, the default
    processing resolution from the model config is used.
|
match_input_res (`bool`, *optional*, defaults to `True`): |
|
Resize depth prediction to match input resolution. |
|
Only valid if `processing_res` > 0. |
|
resample_method: (`str`, *optional*, defaults to `bilinear`): |
|
Resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`. |
|
batch_size (`int`, *optional*, defaults to `0`): |
|
Inference batch size, no bigger than `ensemble_size`. If set to 0, a suitable batch size is chosen automatically.
|
show_progress_bar (`bool`, *optional*, defaults to `True`): |
|
Display a progress bar of diffusion denoising. |
|
upscale_factor (`int`, *optional*, defaults to `2`):
    Upscaling factor of the boosted prediction relative to the processing resolution. Must be a power of 2;
    a value of 1 returns the base prediction without boosting.
|
ensemble_kwargs (`dict`, *optional*, defaults to `None`): |
|
Arguments for detailed ensembling settings. |
|
Returns: |
|
`MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including: |
|
- **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1] |
|
- **base_depth_np** (`np.ndarray`) Upsampled base depth map, with depth values in the range of [0, 1]. |
|
This is the depth map used as a global guidance for the boosted inference. |
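
Example (illustrative sketch; the checkpoint path below is a placeholder, not necessarily a published model):

    import torch
    from PIL import Image

    pipe = MarigoldDepthHRPipeline.from_pretrained(
        "path/or/hub-id-of-a-marigold-hr-checkpoint", torch_dtype=torch.float16
    ).to("cuda")
    image = Image.open("input.jpg")
    out = pipe(image, ensemble_size=5, upscale_factor=2)
    depth = out.depth_np  # np.ndarray in [0, 1]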
|
""" |
|
|
|
|
|
if denoising_steps is None: |
|
denoising_steps = self.default_denoising_steps |
|
if boosted_denoising_steps is None: |
|
boosted_denoising_steps = self.default_boosting_denoising_steps |
|
if processing_res is None: |
|
processing_res = self.default_processing_resolution |
|
|
|
|
|
assert processing_res >= 0, "Processing resolution must be non-negative." |
|
assert ensemble_size >= 1, "Ensemble size must be at least 1." |
|
assert boosted_ensemble_size >= 1, "Boosted ensemble size must be at least 1." |
|
assert math.log2( |
|
upscale_factor |
|
).is_integer(), "Upscale factor must be a power of 2." |
|
assert upscale_factor >= 1, "Upscale factor must be at least 1."
|
assert batch_size >= 0, "Batch size must be non-negative." |
|
|
|
|
|
if upscale_factor >= 8 and self.dtype == torch.float16: |
|
logging.warning( |
|
"Warning: Upscaling factors of 8 (and more) with half precision or more may lead to artifacts in the final prediction." |
|
) |
|
if upscale_factor >= 4 and isinstance(self.vae, AutoencoderTiny): |
|
logging.warning( |
|
"Warning: Upscaling factors of 4 (and more) with the Tiny VAE may lead to instabilities." |
|
) |
|
|
|
|
|
if isinstance(input_image, Image.Image):
    input_width, input_height = input_image.size
else:
    # Tensor inputs are [..., H, W]
    input_height, input_width = input_image.shape[-2:]
|
|
|
|
|
if base_depth is not None: |
|
|
|
if isinstance(base_depth, Image.Image): |
|
lowres = np.asarray(base_depth.convert("L"), dtype=np.float32) |
|
elif isinstance(base_depth, torch.Tensor): |
|
lowres = base_depth.squeeze().cpu().numpy().astype(np.float32) |
|
elif isinstance(base_depth, np.ndarray): |
|
lowres = base_depth.squeeze().astype(np.float32) |
|
elif isinstance(base_depth, MarigoldDepthOutput): |
|
lowres = base_depth.depth_np.astype(np.float32) |
|
else: |
|
raise TypeError(f"Unsupported base_depth type: {type(base_depth)}") |
|
|
|
|
|
min_v, max_v = lowres.min(), lowres.max() |
|
eps = 1e-8 |
|
if max_v - min_v > eps: |
|
lowres = (lowres - min_v) / (max_v - min_v + eps) |
|
else: |
|
|
|
lowres = np.zeros_like(lowres, dtype=np.float32) |
|
else: |
|
assert self.base_pipe is not None |
|
if self.base_pipe.device != self.device: |
|
|
|
self.base_pipe.to(self.device) |
|
|
|
base_out = self.base_pipe( |
|
input_image, |
|
num_inference_steps=denoising_steps, |
|
ensemble_size=ensemble_size, |
|
processing_resolution=processing_res, |
|
match_input_resolution=False, |
|
batch_size=1, |
|
|
|
resample_method_input=resample_method, |
|
resample_method_output=resample_method, |
|
ensembling_kwargs=ensemble_kwargs, |
|
) |
|
lowres = base_out.prediction[0, :, :, 0]
|
base_out.depth_np = lowres |
|
|
|
|
|
|
|
t = torch.from_numpy(lowres[None, None])
|
up = resize(t, (input_height, input_width), |
|
interpolation=InterpolationMode.NEAREST_EXACT, |
|
antialias=True) |
|
base_depth_np_upsampled = up.squeeze().cpu().numpy() |
|
|
|
|
|
if upscale_factor == 1: |
|
|
|
return MarigoldDepthOutput( |
|
depth_np=lowres, |
|
base_depth_np=base_depth_np_upsampled |
|
) |
|
|
|
|
|
global_pred = torch.from_numpy(lowres).to(self.device) |
|
global_pred = (global_pred - global_pred.min()) / (global_pred.max() - global_pred.min()) |
|
|
|
|
|
current_pred = global_pred |
|
|
|
|
|
|
upscale_factors = [2**i for i in range(1, int(math.log2(upscale_factor)) + 1)] |
|
|
|
|
|
if processing_res == 0: |
|
processing_res = 768 |
|
|
|
df = min(
    processing_res / input_width, processing_res / input_height
)
patch_height = int(input_height * df)
patch_width = int(input_width * df)
|
|
|
|
|
for factor in upscale_factors: |
|
tw = patch_width * factor |
|
th = patch_height * factor |
|
if tw > input_width * 1.1 or th > input_height * 1.1: |
|
logging.warning( |
|
f"Warning: Attempting to upsample to {tw}×{th}, " |
|
f"which exceeds the original input of {input_width}×{input_height}. " |
|
"This technically works, but may lead to suboptimal results." |
|
) |
|
|
|
|
|
with tqdm( |
|
total=len(upscale_factors), |
|
desc=" Upscaling Progress", |
|
unit="step", |
|
leave=False, |
|
) as pbar: |
|
for current_factor in upscale_factors: |
|
|
|
|
|
pbar.set_description(f" Upscaling x{current_factor}") |
|
|
|
|
|
is_final_step = current_factor == upscale_factor |
|
|
|
|
|
boosted_output = self.boosted_inference( |
|
input_image=input_image, |
|
denoising_steps=boosted_denoising_steps, |
|
ensemble_size=boosted_ensemble_size,
|
processing_res=processing_res, |
|
match_input_res=match_input_res and is_final_step, |
|
batch_size=batch_size, |
|
resample_method=resample_method, |
|
show_progress_bar=show_progress_bar, |
|
ensemble_kwargs=ensemble_kwargs, |
|
global_pred=current_pred, |
|
upscale_factor=current_factor, |
|
) |
|
|
|
|
|
current_pred = torch.from_numpy(boosted_output.depth_np) |
|
|
|
|
|
torch.cuda.empty_cache() |
|
|
|
|
|
|
|
|
|
|
pbar.update(1) |
|
|
|
|
|
out = boosted_output |
|
out.base_depth_np = base_depth_np_upsampled |
|
|
|
return out |
|
|
|
def boosted_inference( |
|
self, |
|
input_image: Union[Image.Image, torch.Tensor],
|
denoising_steps: int = 10, |
|
ensemble_size: int = 10, |
|
processing_res: int = 768, |
|
match_input_res: bool = True, |
|
batch_size: int = 0, |
|
resample_method: str = "bilinear", |
|
seed: Union[int, None] = None, |
|
show_progress_bar: bool = True, |
|
ensemble_kwargs: Optional[Dict] = None,

global_pred: Optional[torch.Tensor] = None,
|
upscale_factor: int = 2, |
|
) -> MarigoldDepthOutput: |
|
""" |
|
Function invoked when calling the pipeline with boosted inference. |
|
|
|
Args: |
|
input_image (`Image` or `torch.Tensor`):
|
Input RGB image. |
|
denoising_steps (`int`, *optional*, defaults to `10`): |
|
Number of diffusion denoising steps (DDIM) during inference. |
|
ensemble_size (`int`, *optional*, defaults to `10`): |
|
Number of predictions to be ensembled. |
|
processing_res (`int`, *optional*, defaults to `768`): |
|
Maximum resolution of processing. |
|
If set to 0: will not resize at all. |
|
match_input_res (`bool`, *optional*, defaults to `True`): |
|
Resize depth prediction to match input resolution. |
|
Only valid if `processing_res` > 0. |
|
resample_method: (`str`, *optional*, defaults to `bilinear`): |
|
Resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`. |
|
batch_size (`int`, *optional*, defaults to `0`): |
|
Inference batch size, no bigger than `ensemble_size`. If set to 0, a suitable batch size is chosen automatically.
|
seed (`int`, *optional*, defaults to `None`): |
|
Random seed for the diffusion process. |
|
show_progress_bar (`bool`, *optional*, defaults to `True`): |
|
Display a progress bar of diffusion denoising. |
|
ensemble_kwargs (`dict`, *optional*, defaults to `None`): |
|
Arguments for detailed ensembling settings. |
|
global_pred (`torch.Tensor`): |
|
Global depth map to be used as guidance. |
|
upscale_factor (`int`, *optional*, defaults to `2`): |
|
Upscale factor of the global depth map. |
|
|
|
Returns: |
|
`MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including: |
|
- **depth_np** (`np.ndarray`) Predicted depth map with depth values in the range of [0, 1] |
|
|
""" |
|
device = self.device |
|
|
|
self._check_inference_step(denoising_steps) |
|
resample_method: InterpolationMode = get_tv_resample_method(resample_method) |
|
|
|
|
|
if isinstance(input_image, Image.Image): |
|
input_image = input_image.convert("RGB") |
|
|
|
input_image = pil_to_tensor(input_image) |
|
elif isinstance(input_image, torch.Tensor): |
|
input_image = input_image.squeeze() |
|
|
|
else: |
|
raise TypeError(f"Unknown input type: {type(input_image) = }") |
|
input_size = input_image.shape |
|
assert ( |
|
3 == input_image.dim() and 3 == input_size[0] |
|
), f"Wrong input shape {input_size}, expected [rgb, H, W]" |
|
|
|
if isinstance(global_pred, torch.Tensor): |
|
global_pred = global_pred.squeeze().unsqueeze(0).to(device) |
|
else: |
|
raise TypeError(f"Unknown global_pred type: {type(global_pred) = }") |
|
|
|
if processing_res == 0: |
|
|
|
processing_res = 768 |
|
|
|
df = min( |
|
processing_res / input_image.shape[2], processing_res / input_image.shape[1] |
|
) |
|
patch_height = int(input_image.shape[1] * df) |
|
patch_width = int(input_image.shape[2] * df) |
|
|
|
|
|
patch_height = round(patch_height / 16) * 16 |
|
patch_width = round(patch_width / 16) * 16 |
|
|
|
patch_size = patch_height, patch_width |
|
global_size = (patch_size[0] * upscale_factor, patch_size[1] * upscale_factor) |
|
|
|
if global_pred.shape[1:] != global_size: |
|
global_pred = resize( |
|
global_pred, |
|
global_size, |
|
interpolation=resample_method, |
|
antialias=True, |
|
) |
|
|
|
if input_image.shape[1:] != global_size:
|
input_image = resize( |
|
input_image, |
|
global_size, |
|
interpolation=resample_method, |
|
antialias=True, |
|
).squeeze() |
|
|
|
input_image = ( |
|
input_image.unsqueeze(0) / 255.0 * 2.0 - 1.0 |
|
) |
|
input_image = input_image.to(self.dtype).to(device) |
|
assert input_image.min() >= -1.0 and input_image.max() <= 1.0 |
|
global_pred = global_pred.to(self.dtype).to(device) |
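# With 50% patch overlap, the global canvas of size (patch * upscale_factor) is covered by a
# (2 * upscale_factor - 1) x (2 * upscale_factor - 1) grid of patches, so the effective number of
# U-Net forward passes per denoising step scales accordingly.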
|
|
|
if batch_size > 0: |
|
_bs = batch_size |
|
else: |
|
_bs = find_batch_size( |
|
ensemble_size=ensemble_size |
|
* (2 * upscale_factor - 1) |
|
* (2 * upscale_factor - 1), |
|
input_res=max(patch_size), |
|
dtype=self.dtype, |
|
) |
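# Slightly compress the guidance depth range: the two lines below map [0, 1] to [0.05, 0.95].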
|
|
|
|
|
global_pred = 0.9 * (global_pred * 2 - 1) |
|
global_pred = (global_pred + 1) / 2 |
|
|
|
depth_pred, pred_uncert = self.multidiffusion_inference( |
|
rgb_norm=input_image, |
|
global_pred=global_pred, |
|
num_inference_steps=denoising_steps, |
|
patch_size=patch_size, |
|
seed=seed, |
|
show_pbar=show_progress_bar, |
|
ensemble_size=ensemble_size, |
|
batch_size=_bs, |
|
ensemble_kwargs=ensemble_kwargs, |
|
) |
|
depth_pred = depth_pred.squeeze(0) |
|
|
|
|
|
min_d = torch.min(depth_pred) |
|
max_d = torch.max(depth_pred) |
|
depth_pred = (depth_pred - min_d) / (max_d - min_d) |
|
|
|
if depth_pred.shape[1:] != input_size[1:] and match_input_res: |
|
depth_pred = resize( |
|
depth_pred.unsqueeze(0), |
|
input_size[1:], |
|
interpolation=resample_method, |
|
antialias=True, |
|
).squeeze() |
|
depth_pred = depth_pred.squeeze().cpu().numpy() |
|
depth_pred = depth_pred.clip(0, 1) |
|
|
|
return MarigoldDepthOutput( |
|
depth_np=depth_pred, |
|
) |
|
|
|
def _check_inference_step(self, n_step: int) -> None: |
|
""" |
|
Check if denoising step is reasonable |
|
Args: |
|
n_step (`int`): denoising steps |
|
""" |
|
assert n_step >= 1 |
|
|
|
def encode_empty_text(self): |
|
""" |
|
Encode text embedding for empty prompt |
|
""" |
|
prompt = "" |
|
text_inputs = self.tokenizer( |
|
prompt, |
|
padding="do_not_pad", |
|
max_length=self.tokenizer.model_max_length, |
|
truncation=True, |
|
return_tensors="pt", |
|
) |
|
text_input_ids = text_inputs.input_ids.to(self.text_encoder.device) |
|
self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype) |
|
|
|
@torch.no_grad() |
|
def multidiffusion_inference( |
|
self, |
|
rgb_norm: torch.Tensor, |
|
global_pred: torch.Tensor, |
|
num_inference_steps: int, |
|
seed: Union[int, None], |
|
show_pbar: bool, |
|
patch_size=(512, 768), |
|
encoder_patch_size=None, |
|
ensemble_size=1, |
|
batch_size=1, |
|
ensemble_kwargs: Optional[Dict] = None,
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    """
    Perform a single patch-based (multidiffusion) depth prediction, optionally ensembling several noise
    initializations.

    Args:
        rgb_norm (`torch.Tensor`):
            Input RGB image, normalized to [-1, 1].
        global_pred (`torch.Tensor`):
            Global depth map used as guidance, with values in [0, 1].
        num_inference_steps (`int`):
            Number of diffusion denoising steps (DDIM) during inference.
        seed (`int` or `None`):
            Random seed for the diffusion process.
        show_pbar (`bool`):
            Display a progress bar of diffusion denoising.
        patch_size (`tuple`, *optional*, defaults to `(512, 768)`):
            Size of the processing patches in pixels.
        encoder_patch_size (`tuple`, *optional*, defaults to `None`):
            Patch size used for VAE encoding/decoding; defaults to `patch_size`.
        ensemble_size (`int`, *optional*, defaults to `1`):
            Number of predictions to be ensembled.
        batch_size (`int`, *optional*, defaults to `1`):
            Batch size for the patch-wise U-Net calls.
        ensemble_kwargs (`dict`, *optional*, defaults to `None`):
            Arguments for detailed ensembling settings.
    Returns:
        `torch.Tensor`: Predicted depth map.
        `torch.Tensor` or `None`: Uncertainty map, or `None` when it is not computed.
    """
|
device = self.device |
|
rgb_norm = rgb_norm.to(device) |
|
global_pred = global_pred.to(device) |
|
|
|
self.scheduler.set_timesteps(num_inference_steps, device=device) |
|
timesteps = self.scheduler.timesteps |
|
|
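# The Stable Diffusion VAE downsamples spatially by a factor of 8, hence the latent patch/canvas sizes.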
|
latent_patch_size = (patch_size[0] // 8, patch_size[1] // 8) |
|
latent_full_size = (rgb_norm.shape[-2] // 8, rgb_norm.shape[-1] // 8) |
|
|
|
|
|
global_pred = global_pred * 2 - 1 |
|
|
|
|
|
if encoder_patch_size is None: |
|
encoder_patch_size = patch_size |
|
rgb_latent = self.encode_rgb_patched( |
|
rgb_norm, patch_size=encoder_patch_size, show_pbar=show_pbar |
|
) |
|
global_pred_latent = self.encode_depth_patched( |
|
global_pred, patch_size=encoder_patch_size, show_pbar=show_pbar |
|
) |
|
|
|
|
|
rgb_norm = rgb_norm.cpu() |
|
global_pred = global_pred.cpu() |
|
|
|
|
|
if seed is None: |
|
rand_num_generator = None |
|
else: |
|
rand_num_generator = torch.Generator(device=device) |
|
rand_num_generator.manual_seed(seed) |
|
|
|
|
|
( |
|
rgb_latent_patched, |
|
num_patches_vert, |
|
num_patches_horz, |
|
step_height, |
|
step_width, |
|
) = self.extract_patches(rgb_latent[0], patch_size=latent_patch_size) |
|
|
|
|
|
global_pred_latent_patched, _, _, _, _ = self.extract_patches( |
|
global_pred_latent[0], patch_size=latent_patch_size |
|
) |
|
|
|
|
|
if self.empty_text_embed is None: |
|
self.encode_empty_text() |
|
batch_empty_text_embed = self.empty_text_embed.repeat((batch_size, 1, 1)).to( |
|
device |
|
) |
|
if hasattr(self, "pooled_empty_text_embeds"): |
|
batch_pooled_empty_text_embed = self.pooled_empty_text_embeds.repeat( |
|
(batch_size, 1, 1) |
|
).to(device) |
|
|
|
|
|
if ensemble_size > 1: |
|
rgb_latent_patched = rgb_latent_patched.repeat(ensemble_size, 1, 1, 1) |
|
global_pred_latent_patched = global_pred_latent_patched.repeat( |
|
ensemble_size, 1, 1, 1 |
|
) |
|
|
|
batch_empty_text_embed = batch_empty_text_embed.repeat(ensemble_size, 1, 1) |
|
if hasattr(self, "pooled_empty_text_embeds"): |
|
batch_pooled_empty_text_embed = batch_pooled_empty_text_embed.repeat( |
|
ensemble_size, 1, 1 |
|
) |
|
|
|
|
|
depth_latent = torch.randn( |
|
(ensemble_size, 4, latent_full_size[0], latent_full_size[1]), |
|
device=device, |
|
dtype=self.dtype, |
|
generator=rand_num_generator, |
|
) |
|
( |
|
depth_latent_patched, |
|
num_patches_vert, |
|
num_patches_horz, |
|
step_height, |
|
step_width, |
|
) = self.extract_patches(depth_latent, patch_size=latent_patch_size) |
|
|
|
|
|
if show_pbar: |
|
iterable = tqdm( |
|
enumerate(timesteps), |
|
total=len(timesteps), |
|
leave=False, |
|
desc=" " * 4 + "Diffusion denoising", |
|
) |
|
else: |
|
iterable = enumerate(timesteps) |
|
|
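# MultiDiffusion-style denoising: at every timestep, all overlapping patches are denoised independently,
# blended back onto the full latent canvas, and then re-extracted so that neighbouring patches stay
# consistent with each other.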
|
for _, t in iterable: |
|
|
|
assert ( |
|
self.boosting_unet.conv_in.in_channels == 12 |
|
), "The input channels of the boosting unet must be 12." |
|
unet_input = torch.cat( |
|
[rgb_latent_patched, global_pred_latent_patched, depth_latent_patched], |
|
dim=1, |
|
) |
|
|
|
|
|
dataset = TensorDataset(unet_input) |
|
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False) |
|
noise_preds = [] |
|
for batch in tqdm( |
|
loader, |
|
leave=False, |
|
disable=not show_pbar, |
|
desc=" " * 6 + "UNet patch inference", |
|
): |
|
(unet_input,) = batch |
|
noise_pred = self.boosting_unet( |
|
unet_input, |
|
t, |
|
encoder_hidden_states=batch_empty_text_embed[: unet_input.shape[0]], |
|
).sample |
|
noise_preds.append(noise_pred) |
|
noise_preds = torch.concat(noise_preds, dim=0) |
|
|
|
|
|
scheduler_out = self.scheduler.step( |
|
noise_preds, t, depth_latent_patched, generator=rand_num_generator |
|
) |
|
|
|
|
|
depth_latent = scheduler_out.prev_sample.reshape( |
|
ensemble_size, |
|
num_patches_vert, |
|
num_patches_horz, |
|
4, |
|
latent_patch_size[0], |
|
latent_patch_size[1], |
|
) |
|
|
|
|
|
depth_latent_full = self.blend_patches( |
|
depth_latent, |
|
canvas_size=global_pred_latent.shape[2:], |
|
num_patches_vert=num_patches_vert, |
|
num_patches_horz=num_patches_horz, |
|
step_height=step_height, |
|
step_width=step_width, |
|
) |
|
|
|
|
|
if t != timesteps[-1]: |
|
depth_latent_patched, _, _, _, _ = self.extract_patches( |
|
depth_latent_full, patch_size=latent_patch_size |
|
) |
|
|
|
|
|
depth = self.decode_depth_patched( |
|
depth_latent_full=depth_latent_full, |
|
canvas_size=global_pred.shape[1:], |
|
latent_decoder_patch_size=( |
|
encoder_patch_size[0] // 8, |
|
encoder_patch_size[1] // 8, |
|
), |
|
show_pbar=show_pbar, |
|
ensemble_size=ensemble_size, |
|
) |
|
|
|
if ensemble_size > 1: |
|
depth, pred_uncert = ensemble_depth( |
|
depth, |
|
scale_invariant=self.scale_invariant, |
|
shift_invariant=self.shift_invariant, |
|
**(ensemble_kwargs or {}), |
|
) |
|
else: |
|
depth = (depth + 1.0) / 2.0 |
|
pred_uncert = None |
|
|
|
return depth, pred_uncert |
|
|
|
def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor: |
|
""" |
|
Encode RGB image into latent. |
|
|
|
Args: |
|
rgb_in (`torch.Tensor`): |
|
Input RGB image to be encoded. |
|
|
|
Returns: |
|
`torch.Tensor`: Image latent. |
|
""" |
|
if isinstance(self.vae, AutoencoderTiny): |
|
rgb_latent = self.vae.encoder(rgb_in) |
|
else: |
|
h = self.vae.encoder(rgb_in) |
|
moments = self.vae.quant_conv(h) |
|
mean, _ = torch.chunk(moments, 2, dim=1) |
|
rgb_latent = mean * self.latent_scale_factor |
|
|
|
return rgb_latent |
|
|
|
def encode_rgb_patched( |
|
self, rgb_in: torch.Tensor, patch_size: tuple, show_pbar: bool = False |
|
) -> torch.Tensor: |
|
""" |
|
Encode depth map into latent. |
|
|
|
Args: |
|
depth_in (`torch.Tensor`): |
|
Input depth map to be encoded. |
|
patch_size (`Tuple[int, int]`): |
|
Size of the patch. |
|
|
|
Returns: |
|
`torch.Tensor`: Depth latent. |
|
""" |
|
device = self.device |
|
|
|
( |
|
rgb_patched, |
|
num_patches_vert_pix, |
|
num_patches_horz_pix, |
|
step_height_pix, |
|
step_width_pix, |
|
) = self.extract_patches(rgb_in.squeeze(0), patch_size=patch_size) |
|
rgb_in = rgb_in.cpu() |
|
rgb_patched = rgb_patched.cpu() |
|
|
|
|
|
rgb_latent_patched = [] |
|
for rgb in tqdm( |
|
rgb_patched, |
|
leave=False, |
|
disable=not show_pbar, |
|
desc=" " * 4 + "Encoding RGB", |
|
): |
|
patch = rgb.unsqueeze(0).to(device) |
|
rgb_latent_patched.append(self.encode_rgb(patch).cpu()) |
|
|
|
|
|
rgb_latent_patched = torch.concat(rgb_latent_patched, dim=0) |
|
rgb_latent_patched = rgb_latent_patched.reshape( |
|
num_patches_vert_pix, |
|
num_patches_horz_pix, |
|
4, |
|
rgb_latent_patched.shape[-2], |
|
rgb_latent_patched.shape[-1], |
|
).to(device) |
|
|
|
|
|
rgb_latent = self.blend_patches( |
|
rgb_latent_patched, |
|
canvas_size=(rgb_in.shape[-2] // 8, rgb_in.shape[-1] // 8), |
|
num_patches_vert=num_patches_vert_pix, |
|
num_patches_horz=num_patches_horz_pix, |
|
step_height=step_height_pix // 8, |
|
step_width=step_width_pix // 8, |
|
) |
|
|
|
if len(rgb_latent.shape) == 3: |
|
rgb_latent = rgb_latent.unsqueeze(0) |
|
|
|
return rgb_latent |
|
|
|
def encode_depth_patched( |
|
self, |
|
depth_in: torch.Tensor, |
|
patch_size, |
|
show_pbar: bool = False, |
|
) -> torch.Tensor: |
|
""" |
|
Encode depth map into latent, but in a patched and scalable way |
|
|
|
Args: |
|
depth_in (`torch.Tensor`): |
|
Input depth map to be encoded. |
|
patch_size (`Tuple[int, int]`): |
|
Size of the patch. |
|
show_pbar (`bool`): |
|
Display a progress bar. |
|
|
|
Returns: |
|
`torch.Tensor`: Depth latent. |
|
""" |
|
device = self.device |
|
ensemble_size = depth_in.shape[0] |
|
|
|
( |
|
depth_in_patched, |
|
num_patches_vert_pix, |
|
num_patches_horz_pix, |
|
step_height_pix, |
|
step_width_pix, |
|
) = self.extract_patches(depth_in, patch_size=patch_size) |
|
depth_in = depth_in.cpu() |
|
depth_in_patched = depth_in_patched.cpu() |
|
|
|
|
|
depth_latent_patched = [] |
|
for gpred in tqdm( |
|
depth_in_patched, |
|
leave=False, |
|
disable=not show_pbar, |
|
desc=" " * 4 + "Encoding context depth", |
|
): |
|
patch = gpred.unsqueeze(0).to(device) |
|
depth_latent_patched.append(self.encode_depth(patch).cpu()) |
|
|
|
|
|
depth_latent_patched = torch.concat(depth_latent_patched, dim=0) |
|
depth_latent_patched = depth_latent_patched.reshape( |
|
ensemble_size, |
|
num_patches_vert_pix, |
|
num_patches_horz_pix, |
|
4, |
|
depth_latent_patched.shape[-2], |
|
depth_latent_patched.shape[-1], |
|
).to(device) |
|
|
|
|
|
depth_latent = self.blend_patches( |
|
depth_latent_patched, |
|
canvas_size=(depth_in.shape[-2] // 8, depth_in.shape[-1] // 8), |
|
num_patches_vert=num_patches_vert_pix, |
|
num_patches_horz=num_patches_horz_pix, |
|
step_height=step_height_pix // 8, |
|
step_width=step_width_pix // 8, |
|
) |
|
|
|
if len(depth_latent.shape) == 3: |
|
depth_latent = depth_latent.unsqueeze(0) |
|
|
|
return depth_latent |
|
|
|
def decode_depth_patched( |
|
self, |
|
depth_latent_full: torch.Tensor, |
|
canvas_size: tuple, |
|
latent_decoder_patch_size: tuple, |
|
show_pbar: bool = True, |
|
ensemble_size: int = 1, |
|
) -> torch.Tensor: |
|
""" |
|
Decode depth map from latent in a patched and scalable way. |
|
|
|
Args: |
|
depth_latent_full (`torch.Tensor`): |
|
Depth latent to be decoded. |
|
canvas_size (`tuple`): |
|
Size of the canvas. |
|
latent_decoder_patch_size (`tuple`): |
|
Size of the patch. |
|
show_pbar (`bool`): |
|
Display a progress bar. |
|
ensemble_size (`int`): |
|
Ensemble size. |
|
|
|
Returns: |
|
`torch.Tensor`: Decoded depth map. |
|
""" |
|
|
|
encoder_patch_size = ( |
|
latent_decoder_patch_size[0] * 8, |
|
latent_decoder_patch_size[1] * 8, |
|
) |
|
|
|
|
|
( |
|
depth_latent_patched, |
|
num_patches_vert, |
|
num_patches_horz, |
|
step_height, |
|
step_width, |
|
) = self.extract_patches( |
|
depth_latent_full, patch_size=latent_decoder_patch_size, overlap=0.5 |
|
) |
|
|
|
|
|
depthp = [] |
|
for patch in tqdm( |
|
depth_latent_patched, |
|
leave=False, |
|
desc=" " * 6 + "Decoding Depth", |
|
disable=not show_pbar, |
|
): |
|
depthp.append(self.decode_depth(patch.unsqueeze(0))) |
|
depthp = torch.concat(depthp, dim=0) |
|
depthp = depthp.reshape( |
|
ensemble_size, |
|
num_patches_vert, |
|
num_patches_horz, |
|
1, |
|
encoder_patch_size[0], |
|
encoder_patch_size[1], |
|
) |
|
|
|
|
|
depth = self.blend_patches( |
|
depthp, |
|
canvas_size=canvas_size, |
|
num_patches_vert=num_patches_vert, |
|
num_patches_horz=num_patches_horz, |
|
step_height=step_height * 8, |
|
step_width=step_width * 8, |
|
) |
|
|
|
return depth |
|
|
|
def decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor: |
|
""" |
|
Decode depth latent into depth map. |
|
|
|
Args: |
|
depth_latent (`torch.Tensor`): |
|
Depth latent to be decoded. |
|
|
|
Returns: |
|
`torch.Tensor`: Decoded depth map of the input depth latent. |
|
""" |
|
|
|
|
|
if isinstance(self.vae, AutoencoderTiny): |
|
stacked = self.vae.decoder(depth_latent) |
|
else: |
|
depth_latent = depth_latent / self.latent_scale_factor |
|
z = self.vae.post_quant_conv(depth_latent) |
|
stacked = self.vae.decoder(z) |
|
|
|
|
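# The decoder outputs 3 channels (depth was replicated to 3 channels before encoding);
# average them back into a single-channel depth map.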
|
depth_mean = stacked.mean(dim=1, keepdim=True) |
|
return depth_mean |
|
|
|
def encode_depth( |
|
self, |
|
depth_in: torch.Tensor, |
|
) -> torch.Tensor: |
|
""" |
|
Encode depth map into latent. |
|
|
|
Args: |
|
depth_in (`torch.Tensor`): |
|
Input depth map to be encoded. |
|
|
|
Returns: |
|
`torch.Tensor`: Depth latent of the input depth map. |
|
""" |
|
|
|
|
|
stacked = self.stack_depth_images(depth_in) |
|
|
|
|
|
depth_latent = self.encode_rgb(stacked) |
|
return depth_latent |
|
|
|
@staticmethod |
|
def stack_depth_images(depth_in):
    # Replicate the single-channel depth map to 3 channels so it can be encoded with the RGB VAE.
    if 4 == len(depth_in.shape):
        stacked = depth_in.repeat(1, 3, 1, 1)
    elif 3 == len(depth_in.shape):
        stacked = depth_in.unsqueeze(1)
        stacked = stacked.repeat(1, 3, 1, 1)
    return stacked
|
|
|
def extract_patches(self, canvas_input_image, patch_size, overlap=0.5): |
|
""" |
|
Extract patches from an image |
|
Args:
    canvas_input_image (`torch.Tensor`): Input canvas of shape [channels, height, width] or
        [ensemble, channels, height, width]
    patch_size (`tuple`): Size of the patch as (height, width)
    overlap (`float`): Fractional overlap between neighbouring patches
Returns:
    input_image_patched (`torch.Tensor`): Extracted patches
    num_patches_vert (`int`): Number of patches in the vertical direction
    num_patches_horz (`int`): Number of patches in the horizontal direction
    step_height (`int`): Step size in the vertical direction
    step_width (`int`): Step size in the horizontal direction
|
""" |
|
|
|
if len(canvas_input_image.shape) == 4: |
|
ensemble_size = canvas_input_image.shape[0] |
|
else: |
|
canvas_input_image = canvas_input_image.unsqueeze(0) |
|
ensemble_size = 1 |
|
|
|
|
|
h, w = patch_size |
|
step_height, step_width = int(h * (1 - overlap)), int(w * (1 - overlap)) |
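# The number of patches per side follows from sliding a window of size (h, w) with the computed step;
# with overlap=0.5 a canvas of size patch * f yields (2f - 1) patches per side.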
|
|
|
|
|
num_patches_vert = (canvas_input_image.shape[-2] - h) // step_height + 1 |
|
num_patches_horz = (canvas_input_image.shape[-1] - w) // step_width + 1 |
|
|
|
|
|
patches = [] |
|
|
|
for e in range(ensemble_size): |
|
for i in range(num_patches_vert): |
|
for j in range(num_patches_horz): |
|
|
|
start_y = i * step_height |
|
start_x = j * step_width |
|
|
|
|
|
patch = canvas_input_image[ |
|
e, :, start_y : start_y + h, start_x : start_x + w |
|
] |
|
patches.append(patch) |
|
|
|
|
|
input_image_patched = torch.stack(patches) |
|
|
|
return ( |
|
input_image_patched, |
|
num_patches_vert, |
|
num_patches_horz, |
|
step_height, |
|
step_width, |
|
) |
|
|
|
def blend_patches( |
|
self, |
|
depth_preds, |
|
num_patches_vert, |
|
num_patches_horz, |
|
step_height, |
|
step_width, |
|
global_depth_pred=None, |
|
canvas_size=None, |
|
alpha_center=1.0, |
|
alpha_edge=1e-4, |
|
noise_blend=False, |
|
eps=1e-8, |
|
): |
|
""" |
|
Blend patches of depth maps and apply the transformation to the global depth map |
|
|
|
Args:
    depth_preds (`torch.Tensor`): Local depth (or latent) patches of shape
        [ensemble, patches_vert, patches_horz, channels, h, w] (the ensemble dimension may be omitted)
    num_patches_vert (`int`): Number of patches in the vertical direction
    num_patches_horz (`int`): Number of patches in the horizontal direction
    step_height (`int`): Step size in the vertical direction
    step_width (`int`): Step size in the horizontal direction
    global_depth_pred (`torch.Tensor`, *optional*): Global depth map used to fill pixels that received no
        valid contribution
    canvas_size (`tuple`): Size (height, width) of the output canvas
    alpha_center (`float`): Weight at the center of the patch
    alpha_edge (`float`): Weight at the edge of the patch, should not be exactly 0 to avoid division by zero
    noise_blend (`bool`): If True, square the weights in the denominator and take its square root, which
        preserves variance when blending noise latents
    eps (`float`): Small constant added to the denominator to avoid division by zero
|
|
|
Returns: |
|
`torch.Tensor`: Blended depth map |
|
""" |
|
|
|
|
|
|
channels, h, w = ( |
|
depth_preds.shape[-3], |
|
depth_preds.shape[-2], |
|
depth_preds.shape[-1], |
|
) |
|
|
|
if len(depth_preds.shape) == 6:
    ensemble_size = depth_preds.shape[0]
|
elif len(depth_preds.shape) == 5: |
|
ensemble_size = 1 |
|
depth_preds = depth_preds.unsqueeze(0) |
|
else: |
|
raise ValueError("depth_preds should have 5 or 6 dimensions") |
|
|
|
|
|
blended_depth_map = torch.zeros( |
|
(ensemble_size, channels, canvas_size[0], canvas_size[1]), |
|
device=depth_preds.device, |
|
dtype=depth_preds.dtype, |
|
) |
|
denominator_map = torch.zeros( |
|
(1, canvas_size[0], canvas_size[1]), |
|
device=depth_preds.device, |
|
dtype=depth_preds.dtype, |
|
) |
|
|
|
for i in range(num_patches_vert): |
|
for j in range(num_patches_horz): |
|
|
|
start_y = i * step_height |
|
start_x = j * step_width |
|
|
|
|
|
local_patch = depth_preds[:, i, j] |
|
|
|
|
|
weights = self.get_linear_weight_map( |
|
h, |
|
w, |
|
device=local_patch.device, |
|
alpha_center=alpha_center, |
|
alpha_edge=alpha_edge, |
|
cosine_blending=True, |
|
margin=0.0, |
|
) |
|
|
|
|
|
blended_depth_map[ |
|
:, :, start_y : start_y + h, start_x : start_x + w |
|
] += (local_patch * weights) |
|
|
|
if noise_blend: |
|
denominator_weights = weights**2 |
|
else: |
|
denominator_weights = weights |
|
denominator_map[ |
|
:, start_y : start_y + h, start_x : start_x + w |
|
] += denominator_weights |
|
|
|
if noise_blend: |
|
denominator_map = torch.sqrt(denominator_map) |
|
|
|
blended_depth_map /= denominator_map + eps |
|
|
|
|
|
|
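# Any pixels that received no valid contribution (NaNs) are filled from the global prediction, if given.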
|
if global_depth_pred is not None: |
|
blended_depth_map[torch.isnan(blended_depth_map)] = ( |
|
global_depth_pred.repeat(ensemble_size, 1, 1, 1)[ |
|
torch.isnan(blended_depth_map) |
|
] |
|
) |
|
|
|
return blended_depth_map |
|
|
|
def get_linear_weight_map( |
|
self, |
|
h, |
|
w, |
|
device, |
|
alpha_center=1.0, |
|
alpha_edge=1e-4, |
|
margin=0.0, |
|
cosine_blending=False, |
|
): |
|
""" |
|
Generate a linear weight map for blending patches |
|
Args: |
|
h (`int`): Height of the weight map |
|
w (`int`): Width of the weight map |
|
device (`torch.device`): Device to use |
|
alpha_center (`float`): Weight at the center of the patch |
|
alpha_edge (`float`): Weight at the edge of the patch |
|
margin (`float`): Percentage of the patch dimensions; this margin at the edges is filled with near-zero weights.
|
Returns: |
|
`torch.Tensor`: Linear distance weight map (looks like a pyramid) |
|
""" |
|
|
|
x = torch.linspace(-1, 1, h, device=device) |
|
y = torch.linspace(-1, 1, w, device=device) |
|
xx, yy = torch.meshgrid(x, y, indexing="ij") |
|
dist = torch.stack([xx.abs(), yy.abs()]).max(dim=0).values |
|
norm_dist = dist / torch.max(dist) |
|
|
|
|
|
norm_dist = torch.clamp(norm_dist + margin, 0, 1) |
|
|
|
|
|
mindist = torch.min(norm_dist) |
|
maxdist = torch.max(norm_dist) |
|
norm_dist = (norm_dist - mindist) / (maxdist - mindist) |
|
|
|
|
|
if cosine_blending: |
|
weights = alpha_edge + (alpha_center - alpha_edge) * ( |
|
0.5 * (1 + torch.cos(norm_dist * math.pi)) |
|
) |
|
else: |
|
weights = alpha_edge + (alpha_center - alpha_edge) * (1 - norm_dist) |
|
|
|
return weights |
|
|
|
|
|
|
|
|
|
|
|
def get_tv_resample_method(method_str: str) -> InterpolationMode: |
|
resample_method_dict = { |
|
"bilinear": InterpolationMode.BILINEAR, |
|
"bicubic": InterpolationMode.BICUBIC, |
|
"nearest": InterpolationMode.NEAREST_EXACT, |
|
"nearest-exact": InterpolationMode.NEAREST_EXACT, |
|
} |
|
resample_method = resample_method_dict.get(method_str, None) |
|
if resample_method is None: |
|
raise ValueError(f"Unknown resampling method: {resample_method}") |
|
else: |
|
return resample_method |
|
|
|
|
|
def resize_max_res( |
|
img: torch.Tensor, |
|
max_edge_resolution: int, |
|
resample_method: InterpolationMode = InterpolationMode.BILINEAR, |
|
) -> torch.Tensor: |
|
""" |
|
Resize image to limit maximum edge length while keeping aspect ratio. |
|
|
|
Args: |
|
img (`torch.Tensor`): |
|
Image tensor to be resized. Expected shape: [B, C, H, W] |
|
max_edge_resolution (`int`): |
|
Maximum edge length (pixel). |
|
resample_method (`InterpolationMode`):
|
Resampling method used to resize images. |
|
|
|
Returns: |
|
`torch.Tensor`: Resized image. |
|
""" |
|
assert 4 == img.dim(), f"Invalid input shape {img.shape}" |
|
|
|
original_height, original_width = img.shape[-2:] |
|
downscale_factor = min( |
|
max_edge_resolution / original_width, max_edge_resolution / original_height |
|
) |
|
|
|
new_width = int(original_width * downscale_factor) |
|
new_height = int(original_height * downscale_factor) |
|
|
|
resized_img = resize(img, (new_height, new_width), resample_method, antialias=True) |
|
return resized_img |
|
|
|
|
|
def ensemble_depth( |
|
depth: torch.Tensor, |
|
scale_invariant: bool = True, |
|
shift_invariant: bool = True, |
|
output_uncertainty: bool = False, |
|
reduction: str = "median", |
|
regularizer_strength: float = 0.02, |
|
max_iter: int = 50, |
|
tol: float = 1e-6, |
|
max_res: int = 1024, |
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: |
|
""" |
|
Ensembles depth maps represented by the `depth` tensor with expected shape `(B, 1, H, W)`, where B is the |
|
number of ensemble members for a given prediction of size `(H x W)`. Even though the function is designed for |
|
depth maps, it can also be used with disparity maps as long as the input tensor values are non-negative. The |
|
alignment happens when the predictions have one or more degrees of freedom, that is when they are either |
|
affine-invariant (`scale_invariant=True` and `shift_invariant=True`), or just scale-invariant (only |
|
`scale_invariant=True`). For absolute predictions (`scale_invariant=False` and `shift_invariant=False`) |
|
alignment is skipped and only ensembling is performed. |
|
|
|
Args: |
|
depth (`torch.Tensor`): |
|
Input ensemble depth maps. |
|
scale_invariant (`bool`, *optional*, defaults to `True`): |
|
Whether to treat predictions as scale-invariant. |
|
shift_invariant (`bool`, *optional*, defaults to `True`): |
|
Whether to treat predictions as shift-invariant. |
|
output_uncertainty (`bool`, *optional*, defaults to `False`): |
|
Whether to output uncertainty map. |
|
reduction (`str`, *optional*, defaults to `"median"`): |
|
Reduction method used to ensemble aligned predictions. The accepted values are: `"mean"` and |
|
`"median"`. |
|
regularizer_strength (`float`, *optional*, defaults to `0.02`): |
|
Strength of the regularizer that pulls the aligned predictions to the unit range from 0 to 1. |
|
max_iter (`int`, *optional*, defaults to `50`):
    Maximum number of the alignment solver steps. Refer to the `scipy.optimize.minimize` function,
    `options` argument.
tol (`float`, *optional*, defaults to `1e-6`):
|
Alignment solver tolerance. The solver stops when the tolerance is reached. |
|
max_res (`int`, *optional*, defaults to `1024`):
    Resolution at which the alignment is performed; if `None`, the alignment is performed at the original
    resolution.
|
Returns: |
|
A tensor of aligned and ensembled depth maps and optionally a tensor of uncertainties of the same shape: |
|
`(1, 1, H, W)`. |
|
""" |
|
if depth.dim() != 4 or depth.shape[1] != 1: |
|
raise ValueError(f"Expecting 4D tensor of shape [B,1,H,W]; got {depth.shape}.") |
|
if reduction not in ("mean", "median"): |
|
raise ValueError(f"Unrecognized reduction method: {reduction}.") |
|
if not scale_invariant and shift_invariant: |
|
raise ValueError("Pure shift-invariant ensembling is not supported.") |
|
|
|
def init_param(depth: torch.Tensor): |
|
init_min = depth.reshape(ensemble_size, -1).min(dim=1).values |
|
init_max = depth.reshape(ensemble_size, -1).max(dim=1).values |
|
|
|
if scale_invariant and shift_invariant: |
|
init_s = 1.0 / (init_max - init_min).clamp(min=1e-6) |
|
init_t = -init_s * init_min |
|
param = torch.cat((init_s, init_t)).cpu().numpy() |
|
elif scale_invariant: |
|
init_s = 1.0 / init_max.clamp(min=1e-6) |
|
param = init_s.cpu().numpy() |
|
else: |
|
raise ValueError("Unrecognized alignment.") |
|
|
|
return param.astype(np.float64) |
|
|
|
def align(depth: torch.Tensor, param: np.ndarray) -> torch.Tensor: |
|
if scale_invariant and shift_invariant: |
|
s, t = np.split(param, 2) |
|
s = torch.from_numpy(s).to(depth).view(ensemble_size, 1, 1, 1) |
|
t = torch.from_numpy(t).to(depth).view(ensemble_size, 1, 1, 1) |
|
out = depth * s + t |
|
elif scale_invariant: |
|
s = torch.from_numpy(param).to(depth).view(ensemble_size, 1, 1, 1) |
|
out = depth * s |
|
else: |
|
raise ValueError("Unrecognized alignment.") |
|
return out |
|
|
|
def ensemble( |
|
depth_aligned: torch.Tensor, return_uncertainty: bool = False |
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: |
|
uncertainty = None |
|
if reduction == "mean": |
|
prediction = torch.mean(depth_aligned, dim=0, keepdim=True) |
|
if return_uncertainty: |
|
uncertainty = torch.std(depth_aligned, dim=0, keepdim=True) |
|
elif reduction == "median": |
|
prediction = torch.median(depth_aligned, dim=0, keepdim=True).values |
|
if return_uncertainty: |
|
uncertainty = torch.median( |
|
torch.abs(depth_aligned - prediction), dim=0, keepdim=True |
|
).values |
|
else: |
|
raise ValueError(f"Unrecognized reduction method: {reduction}.") |
|
return prediction, uncertainty |
|
|
|
def cost_fn(param: np.ndarray, depth: torch.Tensor) -> float: |
|
cost = 0.0 |
|
depth_aligned = align(depth, param) |
|
|
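# Pairwise RMSE between every pair of aligned ensemble members; lower cost means better mutual agreement.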
|
for i, j in torch.combinations(torch.arange(ensemble_size)): |
|
diff = depth_aligned[i] - depth_aligned[j] |
|
cost += (diff**2).mean().sqrt().item() |
|
|
|
if regularizer_strength > 0: |
|
prediction, _ = ensemble(depth_aligned, return_uncertainty=False) |
|
err_near = (0.0 - prediction.min()).abs().item() |
|
err_far = (1.0 - prediction.max()).abs().item() |
|
cost += (err_near + err_far) * regularizer_strength |
|
|
|
return cost |
|
|
|
def compute_param(depth: torch.Tensor): |
|
import scipy |
|
|
|
depth_to_align = depth.to(torch.float32) |
|
if max_res is not None and max(depth_to_align.shape[2:]) > max_res: |
|
depth_to_align = resize_max_res( |
|
depth_to_align, max_res, get_tv_resample_method("nearest-exact") |
|
) |
|
|
|
param = init_param(depth_to_align) |
|
|
|
res = scipy.optimize.minimize( |
|
partial(cost_fn, depth=depth_to_align), |
|
param, |
|
method="BFGS", |
|
tol=tol, |
|
options={"maxiter": max_iter, "disp": False}, |
|
) |
|
|
|
return res.x |
|
|
|
requires_aligning = scale_invariant or shift_invariant |
|
ensemble_size = depth.shape[0] |
|
|
|
if requires_aligning: |
|
param = compute_param(depth) |
|
depth = align(depth, param) |
|
|
|
depth, uncertainty = ensemble(depth, return_uncertainty=output_uncertainty) |
|
|
|
depth_max = depth.max() |
|
if scale_invariant and shift_invariant: |
|
depth_min = depth.min() |
|
elif scale_invariant: |
|
depth_min = 0 |
|
else: |
|
raise ValueError("Unrecognized alignment.") |
|
depth_range = (depth_max - depth_min).clamp(min=1e-6) |
|
depth = (depth - depth_min) / depth_range |
|
if output_uncertainty: |
|
uncertainty /= depth_range |
|
|
|
return depth, uncertainty |
|
|
|
|
|
|
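# Empirical lookup table for automatic batch-size selection: "res" is the processing resolution,
# "total_vram" is the total GPU memory in GiB, and "bs" is the batch size that fits for the given dtype.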
|
bs_search_table = [ |
|
|
|
{"res": 768, "total_vram": 79, "bs": 35, "dtype": torch.float32}, |
|
{"res": 1024, "total_vram": 79, "bs": 20, "dtype": torch.float32}, |
|
|
|
{"res": 768, "total_vram": 39, "bs": 15, "dtype": torch.float32}, |
|
{"res": 1024, "total_vram": 39, "bs": 8, "dtype": torch.float32}, |
|
{"res": 768, "total_vram": 39, "bs": 30, "dtype": torch.float16}, |
|
{"res": 1024, "total_vram": 39, "bs": 15, "dtype": torch.float16}, |
|
|
|
{"res": 512, "total_vram": 23, "bs": 20, "dtype": torch.float32}, |
|
{"res": 768, "total_vram": 23, "bs": 7, "dtype": torch.float32}, |
|
{"res": 1024, "total_vram": 23, "bs": 3, "dtype": torch.float32}, |
|
{"res": 512, "total_vram": 23, "bs": 40, "dtype": torch.float16}, |
|
{"res": 768, "total_vram": 23, "bs": 18, "dtype": torch.float16}, |
|
{"res": 1024, "total_vram": 23, "bs": 10, "dtype": torch.float16}, |
|
|
|
{"res": 512, "total_vram": 10, "bs": 5, "dtype": torch.float32}, |
|
{"res": 768, "total_vram": 10, "bs": 2, "dtype": torch.float32}, |
|
{"res": 512, "total_vram": 10, "bs": 10, "dtype": torch.float16}, |
|
{"res": 768, "total_vram": 10, "bs": 5, "dtype": torch.float16}, |
|
{"res": 1024, "total_vram": 10, "bs": 3, "dtype": torch.float16}, |
|
] |
|
|
|
|
|
def find_batch_size(ensemble_size: int, input_res: int, dtype: torch.dtype) -> int: |
|
""" |
|
Automatically search for suitable operating batch size. |
|
|
|
Args: |
|
ensemble_size (`int`): |
|
Number of predictions to be ensembled. |
|
input_res (`int`):
    Operating resolution of the input image.
dtype (`torch.dtype`):
    Data type used for inference.
|
|
|
Returns: |
|
`int`: Operating batch size. |
|
""" |
|
if not torch.cuda.is_available(): |
|
return 1 |
|
|
|
total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3 |
|
filtered_bs_search_table = [s for s in bs_search_table if s["dtype"] == dtype] |
|
for settings in sorted( |
|
filtered_bs_search_table, |
|
key=lambda k: (k["res"], -k["total_vram"]), |
|
): |
|
if input_res <= settings["res"] and total_vram >= settings["total_vram"]: |
|
bs = settings["bs"] |
|
if bs > ensemble_size: |
|
bs = ensemble_size |
|
elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size: |
|
bs = math.ceil(ensemble_size / 2) |
|
return bs |
|
|
|
return 1 |
|
|