Phr00t committed
Commit 5b1c304 · verified · 1 parent: 16ef190

Upload nodes_qwen.py

Files changed (1)
  1. fixed-textencode-node/nodes_qwen.py +118 -0
fixed-textencode-node/nodes_qwen.py ADDED
@@ -0,0 +1,118 @@
+ import node_helpers
+ import comfy.utils
+ import math
+ from typing_extensions import override
+ from comfy_api.latest import ComfyExtension, io
+
+ class TextEncodeQwenImageEdit(io.ComfyNode):
+     @classmethod
+     def define_schema(cls):
+         return io.Schema(
+             node_id="TextEncodeQwenImageEdit",
+             category="advanced/conditioning",
+             inputs=[
+                 io.Clip.Input("clip"),
+                 io.String.Input("prompt", multiline=True, dynamic_prompts=True),
+                 io.Vae.Input("vae", optional=True),
+                 io.Image.Input("image", optional=True),
+             ],
+             outputs=[
+                 io.Conditioning.Output(),
+             ],
+         )
+
+     @classmethod
+     def execute(cls, clip, prompt, vae=None, image=None) -> io.NodeOutput:
+         ref_latent = None
+         if image is None:
+             images = []
+         else:
+             samples = image.movedim(-1, 1)
+             total = int(1024 * 1024)
+
+             scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
+             width = round(samples.shape[3] * scale_by)
+             height = round(samples.shape[2] * scale_by)
+
+             s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
+             image = s.movedim(1, -1)
+             images = [image[:, :, :, :3]]
+             if vae is not None:
+                 ref_latent = vae.encode(image[:, :, :, :3])
+
+         tokens = clip.tokenize(prompt, images=images)
+         conditioning = clip.encode_from_tokens_scheduled(tokens)
+         if ref_latent is not None:
+             conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": [ref_latent]}, append=True)
+         return io.NodeOutput(conditioning)
+
+
+ class TextEncodeQwenImageEditPlus(io.ComfyNode):
+     @classmethod
+     def define_schema(cls):
+         return io.Schema(
+             node_id="TextEncodeQwenImageEditPlus",
+             category="advanced/conditioning",
+             inputs=[
+                 io.Clip.Input("clip"),
+                 io.String.Input("prompt", multiline=True, dynamic_prompts=True),
+                 io.Vae.Input("vae", optional=True),
+                 io.Image.Input("image1", optional=True),
+                 io.Image.Input("image2", optional=True),
+                 io.Image.Input("image3", optional=True),
+                 io.Image.Input("image4", optional=True),
+                 io.Int.Input("target_size", optional=True, default=896, min=128, max=2048, step=32),
+             ],
+             outputs=[
+                 io.Conditioning.Output(),
+             ],
+         )
+
+     @classmethod
+     def execute(cls, clip, prompt, vae=None, image1=None, image2=None, image3=None, image4=None, target_size=896) -> io.NodeOutput:
+         ref_latents = []
+         images = [image1, image2, image3, image4]
+         images_vl = []
+         llama_template = "<|im_start|>system\nDescribe key details of the input image (including any objects, characters, poses, facial features, clothing, setting, textures and style), then explain how the user's text instruction should alter, modify or recreate the image. Generate a new image that meets the user's requirements, which can vary from a small change to a completely new image using inputs as a guide.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+         image_prompt = ""
+
+         for i, image in enumerate(images):
+             if image is not None:
+                 samples = image.movedim(-1, 1)
+                 total = int(384 * 384)
+
+                 scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
+                 width = round(samples.shape[3] * scale_by)
+                 height = round(samples.shape[2] * scale_by)
+
+                 s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
+                 images_vl.append(s.movedim(1, -1))
+                 if vae is not None:
+                     total = int(target_size * target_size)
+                     scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
+
+                     height = int(samples.shape[2] * scale_by / 32) * 32
+                     width = int(samples.shape[3] * scale_by / 32) * 32
+
+                     s = comfy.utils.common_upscale(samples, width, height, "lanczos", "center")
+                     ref_latents.append(vae.encode(s.movedim(1, -1)[:, :, :, :3]))
+
+                 image_prompt += "Picture {}: <|vision_start|><|image_pad|><|vision_end|>".format(i + 1)
+
+         tokens = clip.tokenize(image_prompt + prompt, images=images_vl, llama_template=llama_template)
+         conditioning = clip.encode_from_tokens_scheduled(tokens)
+         if len(ref_latents) > 0:
+             conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": ref_latents}, append=True)
+         return io.NodeOutput(conditioning)
+
+ class QwenExtension(ComfyExtension):
+     @override
+     async def get_node_list(self) -> list[type[io.ComfyNode]]:
+         return [
+             TextEncodeQwenImageEdit,
+             TextEncodeQwenImageEditPlus,
+         ]
+
+
+ async def comfy_entrypoint() -> QwenExtension:
+     return QwenExtension()
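
For reference, the sizing logic in `TextEncodeQwenImageEditPlus` scales each reference image so its area is roughly `target_size * target_size` pixels while preserving aspect ratio, then floors both sides to multiples of 32 before VAE encoding (the vision-language copy is scaled toward a 384×384 area instead). Below is a minimal standalone sketch of that math, using plain Python only; the `compute_latent_size` helper name is illustrative and not part of the node.

```python
import math

def compute_latent_size(height, width, target_size=896):
    # Scale factor that brings the image area to roughly target_size^2 pixels.
    scale_by = math.sqrt((target_size * target_size) / (width * height))
    # Floor each side to a multiple of 32, mirroring the node's int(... / 32) * 32.
    new_height = int(height * scale_by / 32) * 32
    new_width = int(width * scale_by / 32) * 32
    return new_height, new_width

# Example: a 1920x1080 input with the default target_size of 896
# comes out as 672x1184 (both multiples of 32, area close to 896^2).
print(compute_latent_size(1080, 1920))
```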