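# Spaces: Runtime error
# NOTE(review): the line above appears to be a hosting-page status banner captured
# together with this file; it is not part of the module -- confirm and remove.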
import torch

import comfy.model_management
import comfy.utils
import node_helpers
import nodes
class CLIPTextEncodeHunyuanDiT:
    """Node that encodes two prompts into one conditioning output.

    The ``bert`` prompt is tokenized first; the ``"mt5xl"`` entry of the
    resulting token dict is then replaced with the ``"mt5xl"`` tokens obtained
    from the separate ``mt5xl`` prompt before encoding.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification: a CLIP object and two multiline prompts.

        Declared as a classmethod so it can be called on the class itself,
        without an instance.
        """
        return {"required": {
            "clip": ("CLIP", ),
            "bert": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            "mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            }}

    RETURN_TYPES = ("CONDITIONING",)
    FUNCTION = "encode"  # name of the method that implements the node

    CATEGORY = "advanced/conditioning"

    def encode(self, clip, bert, mt5xl):
        """Tokenize both prompts with ``clip`` and encode the merged tokens.

        Args:
            clip: object providing ``tokenize(text)`` (returning a dict that
                contains an ``"mt5xl"`` key) and
                ``encode_from_tokens_scheduled(tokens)``.
            bert: prompt whose tokenization supplies every entry except
                ``"mt5xl"``.
            mt5xl: prompt whose tokenization supplies the ``"mt5xl"`` entry.

        Returns:
            A 1-tuple holding the result of ``clip.encode_from_tokens_scheduled``.
        """
        tokens = clip.tokenize(bert)
        # Only the "mt5xl" entry is taken from the second prompt.
        tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]
        return (clip.encode_from_tokens_scheduled(tokens), )
class EmptyHunyuanLatentVideo:
    """Node that produces an all-zero latent tensor for video generation."""

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification (width, height, length, batch_size).

        Declared as a classmethod so it can be called on the class itself,
        without an instance.
        """
        return {"required": {"width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}

    RETURN_TYPES = ("LATENT",)
    FUNCTION = "generate"  # name of the method that implements the node

    CATEGORY = "latent/video"

    def generate(self, width, height, length, batch_size=1):
        """Create a zero-filled latent for the requested video size.

        The tensor has shape
        ``[batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8]``
        and is allocated on ``comfy.model_management.intermediate_device()``.
        (Presumably 16 is the latent channel count and the divisors are the
        temporal/spatial compression factors of the VAE -- confirm.)

        Returns:
            A 1-tuple holding ``{"samples": latent}``.
        """
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        return ({"samples": latent}, )
# Chat-style prompt template passed as ``llama_template`` when tokenizing
# image-to-video prompts. It contains a single "{}" placeholder in the user
# turn. The text is runtime data: the adjacent string pieces are concatenated
# exactly as written (no separator between the numbered items), so do not
# reformat or "fix" the wording.
PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
    "1. The main content and theme of the video."
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
    "4. background environment, light, style and atmosphere."
    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
class TextEncodeHunyuanVideo_ImageToVideo:
    """Node that encodes a text prompt together with CLIP-vision image embeddings."""

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification for the node.

        Declared as a classmethod so it can be called on the class itself,
        without an instance.
        """
        return {"required": {
            "clip": ("CLIP", ),
            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
            "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            "image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}),
            }}

    RETURN_TYPES = ("CONDITIONING",)
    FUNCTION = "encode"  # name of the method that implements the node

    CATEGORY = "advanced/conditioning"

    def encode(self, clip, clip_vision_output, prompt, image_interleave):
        """Tokenize ``prompt`` with the image-to-video template and encode it.

        Args:
            clip: object providing ``tokenize(...)`` and
                ``encode_from_tokens_scheduled(tokens)``.
            clip_vision_output: object whose ``mm_projected`` attribute is
                forwarded to the tokenizer as ``image_embeds``.
            prompt: user text; tokenized with
                ``PROMPT_TEMPLATE_ENCODE_VIDEO_I2V`` as ``llama_template``.
            image_interleave: forwarded unchanged to the tokenizer.

        Returns:
            A 1-tuple holding the result of ``clip.encode_from_tokens_scheduled``.
        """
        tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
        return (clip.encode_from_tokens_scheduled(tokens), )
class HunyuanImageToVideo:
    """Node that prepares conditioning and an initial latent for image-to-video.

    Without ``start_image`` it returns the conditioning unchanged plus an
    all-zero latent. With ``start_image`` the image is resized, VAE-encoded and
    attached to the conditioning according to ``guidance_type``.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification for the node.

        Declared as a classmethod so it can be called on the class itself,
        without an instance.
        """
        return {"required": {"positive": ("CONDITIONING", ),
                             "vae": ("VAE", ),
                             "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                             "guidance_type": (["v1 (concat)", "v2 (replace)"], )
                             },
                "optional": {"start_image": ("IMAGE", ),
                             }}

    RETURN_TYPES = ("CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "latent")
    FUNCTION = "encode"  # name of the method that implements the node

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None):
        """Build the latent dict and, if an image is given, extend the conditioning.

        Args:
            positive: conditioning passed to
                ``node_helpers.conditioning_set_values`` when ``start_image``
                is provided; returned untouched otherwise.
            vae: object providing ``encode(image)``.
            width, height, length, batch_size: determine the latent shape
                ``[batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8]``.
            guidance_type: ``"v1 (concat)"`` attaches the encoded image and a
                mask to the conditioning; any other value takes the "replace"
                path, which writes the encoded image into the latent instead.
            start_image: optional image tensor, indexed as 4-D
                (presumably frames x height x width x channels -- confirm).

        Returns:
            ``(positive, out_latent)`` where ``out_latent`` always has
            ``"samples"`` and, on the replace path, also ``"noise_mask"``.
        """
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        out_latent = {}

        if start_image is not None:
            # Keep at most `length` entries on dim 0 and the first 3 entries of
            # the last dim, move the last dim to position 1 for the resize,
            # then move it back.
            start_image = comfy.utils.common_upscale(start_image[:length, :, :, :3].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)

            concat_latent_image = vae.encode(start_image)
            # Mask spans every latent frame; entries covering the encoded
            # start frames are zeroed, the rest stay at one.
            mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
            mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0

            # NOTE(review): source indentation was lost; this nesting is the
            # arrangement in which `cond` is always bound before it is used.
            # Confirm that `noise_mask` belongs only to the replace path.
            if guidance_type == "v1 (concat)":
                cond = {"concat_latent_image": concat_latent_image, "concat_mask": mask}
            else:
                cond = {'guiding_frame_index': 0}
                latent[:, :, :concat_latent_image.shape[2]] = concat_latent_image
                out_latent["noise_mask"] = mask

            positive = node_helpers.conditioning_set_values(positive, cond)

        out_latent["samples"] = latent
        return (positive, out_latent)
# Maps each node's registered name to the class that implements it.
NODE_CLASS_MAPPINGS = {
    "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
    "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
    "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
    "HunyuanImageToVideo": HunyuanImageToVideo,
}