ViewCrafter / utils /diffusion_utils.py
Drexubery's picture
update
df13f4b
import importlib
import numpy as np
import cv2
import torch
import torch.distributed as dist
from collections import OrderedDict
import os
from lvdm.models.samplers.ddim import DDIMSampler
from lvdm.models.samplers.ddim_multiplecond import DDIMSampler as DDIMSampler_multicond
from einops import rearrange, repeat
def count_params(model, verbose=False):
total_params = sum(p.numel() for p in model.parameters())
if verbose:
print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
return total_params
def check_istarget(name, para_list):
"""
name: full name of source para
para_list: partial name of target para
"""
istarget=False
for para in para_list:
if para in name:
return True
return istarget
def instantiate_from_config(config):
if not "target" in config:
if config == '__is_first_stage__':
return None
elif config == "__is_unconditional__":
return None
raise KeyError("Expected key `target` to instantiate.")
return get_obj_from_str(config["target"])(**config.get("params", dict()))
def get_obj_from_str(string, reload=False):
module, cls = string.rsplit(".", 1)
if reload:
module_imp = importlib.import_module(module)
importlib.reload(module_imp)
return getattr(importlib.import_module(module, package=None), cls)
def load_npz_from_dir(data_dir):
data = [np.load(os.path.join(data_dir, data_name))['arr_0'] for data_name in os.listdir(data_dir)]
data = np.concatenate(data, axis=0)
return data
def load_npz_from_paths(data_paths):
data = [np.load(data_path)['arr_0'] for data_path in data_paths]
data = np.concatenate(data, axis=0)
return data
def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None):
h, w = image.shape[:2]
if resize_short_edge is not None:
k = resize_short_edge / min(h, w)
else:
k = max_resolution / (h * w)
k = k**0.5
h = int(np.round(h * k / 64)) * 64
w = int(np.round(w * k / 64)) * 64
image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
return image
def setup_dist(args):
if dist.is_initialized():
return
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(
'nccl',
init_method='env://'
)
def load_model_checkpoint(model, ckpt):
state_dict = torch.load(ckpt, map_location="cpu")
if "state_dict" in list(state_dict.keys()):
state_dict = state_dict["state_dict"]
try:
model.load_state_dict(state_dict, strict=True)
except:
## rename the keys for 256x256 model
new_pl_sd = OrderedDict()
for k,v in state_dict.items():
new_pl_sd[k] = v
for k in list(new_pl_sd.keys()):
if "framestride_embed" in k:
new_key = k.replace("framestride_embed", "fps_embedding")
new_pl_sd[new_key] = new_pl_sd[k]
del new_pl_sd[k]
model.load_state_dict(new_pl_sd, strict=True)
else:
# deepspeed
new_pl_sd = OrderedDict()
for key in state_dict['module'].keys():
new_pl_sd[key[16:]]=state_dict['module'][key]
model.load_state_dict(new_pl_sd)
print('>>> model checkpoint loaded.')
return model
def get_latent_z(model, videos):
b, c, t, h, w = videos.shape
x = rearrange(videos, 'b c t h w -> (b t) c h w')
z = model.encode_first_stage(x)
z = rearrange(z, '(b t) c h w -> b c t h w', b=b, t=t)
return z
def image_guided_synthesis(model, prompts, videos, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1., \
unconditional_guidance_scale=1.0, cfg_img=None, fs=None, text_input=False, multiple_cond_cfg=False, timestep_spacing='uniform', guidance_rescale=0.0, condition_index=None, **kwargs):
ddim_sampler = DDIMSampler(model) if not multiple_cond_cfg else DDIMSampler_multicond(model)
batch_size = noise_shape[0]
fs = torch.tensor([fs] * batch_size, dtype=torch.long, device=model.device)
if not text_input:
prompts = [""]*batch_size
assert condition_index is not None, "Error: condition index is None!"
img = videos[:,:,condition_index[0]] #bchw
img_emb = model.embedder(img) ## blc
img_emb = model.image_proj_model(img_emb)
cond_emb = model.get_learned_conditioning(prompts)
cond = {"c_crossattn": [torch.cat([cond_emb,img_emb], dim=1)]}
if model.model.conditioning_key == 'hybrid':
z = get_latent_z(model, videos) # b c t h w
# if loop or interp:
# img_cat_cond = torch.zeros_like(z)
# img_cat_cond[:,:,0,:,:] = z[:,:,0,:,:]
# img_cat_cond[:,:,-1,:,:] = z[:,:,-1,:,:]
# else:
img_cat_cond = z
cond["c_concat"] = [img_cat_cond] # b c 1 h w
if unconditional_guidance_scale != 1.0:
if model.uncond_type == "empty_seq":
prompts = batch_size * [""]
uc_emb = model.get_learned_conditioning(prompts)
elif model.uncond_type == "zero_embed":
uc_emb = torch.zeros_like(cond_emb)
uc_img_emb = model.embedder(torch.zeros_like(img)) ## b l c
uc_img_emb = model.image_proj_model(uc_img_emb)
uc = {"c_crossattn": [torch.cat([uc_emb,uc_img_emb],dim=1)]}
if model.model.conditioning_key == 'hybrid':
uc["c_concat"] = [img_cat_cond]
else:
uc = None
## we need one more unconditioning image=yes, text=""
if multiple_cond_cfg and cfg_img != 1.0:
uc_2 = {"c_crossattn": [torch.cat([uc_emb,img_emb],dim=1)]}
if model.model.conditioning_key == 'hybrid':
uc_2["c_concat"] = [img_cat_cond]
kwargs.update({"unconditional_conditioning_img_nonetext": uc_2})
else:
kwargs.update({"unconditional_conditioning_img_nonetext": None})
z0 = None
cond_mask = None
batch_variants = []
for _ in range(n_samples):
if z0 is not None:
cond_z0 = z0.clone()
kwargs.update({"clean_cond": True})
else:
cond_z0 = None
if ddim_sampler is not None:
samples, _ = ddim_sampler.sample(S=ddim_steps,
conditioning=cond,
batch_size=batch_size,
shape=noise_shape[1:],
verbose=False,
unconditional_guidance_scale=unconditional_guidance_scale,
unconditional_conditioning=uc,
eta=ddim_eta,
cfg_img=cfg_img,
mask=cond_mask,
x0=cond_z0,
fs=fs,
timestep_spacing=timestep_spacing,
guidance_rescale=guidance_rescale,
**kwargs
)
## reconstruct from latent to pixel space
batch_images = model.decode_first_stage(samples)
batch_variants.append(batch_images)
## variants, batch, c, t, h, w
batch_variants = torch.stack(batch_variants)
return batch_variants.permute(1, 0, 2, 3, 4, 5)