fantos committed on
Commit 0bf993c · verified · 1 parent: 595a73a

Update app.py

Files changed (1):
app.py (+67 −49)
app.py CHANGED
@@ -5,13 +5,13 @@ import cv2
 import gradio as gr
 import numpy as np
 from huggingface_hub import snapshot_download
-from transformers import CLIPVisionModelWithProjection,CLIPImageProcessor
+from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor, pipeline
 from diffusers.utils import load_image
 from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import StableDiffusionXLControlNetImg2ImgPipeline
 from kolors.models.modeling_chatglm import ChatGLMModel
 from kolors.models.tokenization_chatglm import ChatGLMTokenizer
 from kolors.models.controlnet import ControlNetModel
-from diffusers import AutoencoderKL
+from diffusers import AutoencoderKL
 from kolors.models.unet_2d_condition import UNet2DConditionModel
 from diffusers import EulerDiscreteScheduler
 from PIL import Image
@@ -19,13 +19,15 @@ from annotator.midas import MidasDetector
 from annotator.dwpose import DWposeDetector
 from annotator.util import resize_image, HWC3
 
-
 device = "cuda"
 ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
 ckpt_dir_depth = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Depth")
 ckpt_dir_canny = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Canny")
 ckpt_dir_pose = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Pose")
 
+# Add translation pipeline
+translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
+
 text_encoder = ChatGLMModel.from_pretrained(f'{ckpt_dir}/text_encoder', torch_dtype=torch.float16).half().to(device)
 tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
 vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(device)
@@ -37,7 +39,7 @@ controlnet_pose = ControlNetModel.from_pretrained(f"{ckpt_dir_pose}", revision=N
 
 pipe_depth = StableDiffusionXLControlNetImg2ImgPipeline(
         vae=vae,
-        controlnet = controlnet_depth,
+        controlnet=controlnet_depth,
         text_encoder=text_encoder,
         tokenizer=tokenizer,
         unet=unet,
@@ -47,7 +49,7 @@ pipe_depth = StableDiffusionXLControlNetImg2ImgPipeline(
 
 pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
         vae=vae,
-        controlnet = controlnet_canny,
+        controlnet=controlnet_canny,
         text_encoder=text_encoder,
         tokenizer=tokenizer,
         unet=unet,
@@ -57,7 +59,7 @@ pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
 
 pipe_pose = StableDiffusionXLControlNetImg2ImgPipeline(
         vae=vae,
-        controlnet = controlnet_pose,
+        controlnet=controlnet_pose,
         text_encoder=text_encoder,
         tokenizer=tokenizer,
         unet=unet,
@@ -65,6 +67,13 @@ pipe_pose = StableDiffusionXLControlNetImg2ImgPipeline(
         force_zeros_for_empty_prompt=False
 )
 
+@spaces.GPU
+def translate_korean_to_english(text):
+    if any(ord(char) >= 0xAC00 and ord(char) <= 0xD7A3 for char in text):  # Check if Korean characters are present
+        translated = translator(text, max_length=512)[0]['translation_text']
+        return translated
+    return text
+
 @spaces.GPU
 def process_canny_condition(image, canny_threods=[100,200]):
     np_image = image.copy()
@@ -90,7 +99,7 @@ def process_dwpose_condition(image, res=1024):
     img = resize_image(HWC3(image), res)
     out_res, out_img = model_dwpose(image)
     result = HWC3(out_img)
-    result = cv2.resize( result, (w,h) )
+    result = cv2.resize(result, (w,h))
     return Image.fromarray(result)
 
 MAX_SEED = np.iinfo(np.int32).max
@@ -99,7 +108,7 @@ MAX_IMAGE_SIZE = 1024
 @spaces.GPU
 def infer_depth(prompt,
             image = None,
-            negative_prompt = "nsfw,脸部阴影,低分辨率,jpeg伪影、模糊、糟糕,黑脸,霓虹灯",
+            negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
             seed = 397886929,
            randomize_seed = False,
            guidance_scale = 6.0,
@@ -108,22 +117,25 @@ def infer_depth(prompt,
            control_guidance_end = 0.9,
            strength = 1.0
            ):
+    prompt = translate_korean_to_english(prompt)
+    negative_prompt = translate_korean_to_english(negative_prompt)
+
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(seed)
-    init_image = resize_image(image, MAX_IMAGE_SIZE)
+    init_image = resize_image(image, MAX_IMAGE_SIZE)
     pipe = pipe_depth.to("cuda")
-    condi_img = process_depth_condition_midas( np.array(init_image), MAX_IMAGE_SIZE)
+    condi_img = process_depth_condition_midas(np.array(init_image), MAX_IMAGE_SIZE)
     image = pipe(
-        prompt= prompt ,
-        image = init_image,
-        controlnet_conditioning_scale = controlnet_conditioning_scale,
-        control_guidance_end = control_guidance_end,
-        strength= strength ,
-        control_image = condi_img,
-        negative_prompt= negative_prompt ,
-        num_inference_steps= num_inference_steps,
-        guidance_scale= guidance_scale,
+        prompt=prompt,
+        image=init_image,
+        controlnet_conditioning_scale=controlnet_conditioning_scale,
+        control_guidance_end=control_guidance_end,
+        strength=strength,
+        control_image=condi_img,
+        negative_prompt=negative_prompt,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
         num_images_per_prompt=1,
         generator=generator,
     ).images[0]
@@ -132,7 +144,7 @@ def infer_depth(prompt,
 @spaces.GPU
 def infer_canny(prompt,
            image = None,
-            negative_prompt = "nsfw,脸部阴影,低分辨率,jpeg伪影、模糊、糟糕,黑脸,霓虹灯",
+            negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
            seed = 397886929,
            randomize_seed = False,
            guidance_scale = 6.0,
@@ -141,22 +153,25 @@ def infer_canny(prompt,
            control_guidance_end = 0.9,
            strength = 1.0
            ):
+    prompt = translate_korean_to_english(prompt)
+    negative_prompt = translate_korean_to_english(negative_prompt)
+
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(seed)
-    init_image = resize_image(image, MAX_IMAGE_SIZE)
+    init_image = resize_image(image, MAX_IMAGE_SIZE)
     pipe = pipe_canny.to("cuda")
     condi_img = process_canny_condition(np.array(init_image))
     image = pipe(
-        prompt= prompt ,
-        image = init_image,
-        controlnet_conditioning_scale = controlnet_conditioning_scale,
-        control_guidance_end = control_guidance_end,
-        strength= strength ,
-        control_image = condi_img,
-        negative_prompt= negative_prompt ,
-        num_inference_steps= num_inference_steps,
-        guidance_scale= guidance_scale,
+        prompt=prompt,
+        image=init_image,
+        controlnet_conditioning_scale=controlnet_conditioning_scale,
+        control_guidance_end=control_guidance_end,
+        strength=strength,
+        control_image=condi_img,
+        negative_prompt=negative_prompt,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
         num_images_per_prompt=1,
         generator=generator,
     ).images[0]
@@ -165,7 +180,7 @@ def infer_canny(prompt,
 @spaces.GPU
 def infer_pose(prompt,
            image = None,
-            negative_prompt = "nsfw,脸部阴影,低分辨率,jpeg伪影、模糊、糟糕,黑脸,霓虹灯",
+            negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
            seed = 66,
            randomize_seed = False,
            guidance_scale = 6.0,
@@ -174,45 +189,48 @@ def infer_pose(prompt,
            control_guidance_end = 0.9,
            strength = 1.0
            ):
+    prompt = translate_korean_to_english(prompt)
+    negative_prompt = translate_korean_to_english(negative_prompt)
+
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(seed)
-    init_image = resize_image(image, MAX_IMAGE_SIZE)
+    init_image = resize_image(image, MAX_IMAGE_SIZE)
     pipe = pipe_pose.to("cuda")
     condi_img = process_dwpose_condition(np.array(init_image), MAX_IMAGE_SIZE)
     image = pipe(
-        prompt= prompt ,
-        image = init_image,
-        controlnet_conditioning_scale = controlnet_conditioning_scale,
-        control_guidance_end = control_guidance_end,
-        strength= strength ,
-        control_image = condi_img,
-        negative_prompt= negative_prompt ,
-        num_inference_steps= num_inference_steps,
-        guidance_scale= guidance_scale,
+        prompt=prompt,
+        image=init_image,
+        controlnet_conditioning_scale=controlnet_conditioning_scale,
+        control_guidance_end=control_guidance_end,
+        strength=strength,
+        control_image=condi_img,
+        negative_prompt=negative_prompt,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
         num_images_per_prompt=1,
         generator=generator,
     ).images[0]
     return [condi_img, image], seed
 
 canny_examples = [
-    ["一个漂亮的女孩,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K",
+    ["아름다운 소녀, 고품질, 초고해상도, 생생한 색상, 최고의 품질, 8k, HD, 4K",
     "image/woman_1.png"],
-    ["全景,一只可爱的白色小狗坐在杯子里,看向镜头,动漫风格,3d渲染,辛烷值渲染",
+    ["전경, 귀여운 흰 강아지가 컵에 앉아 카메라를 보고 있다, 애니메이션 스타일, 3D 렌더링",
     "image/dog.png"]
 ]
 
 depth_examples = [
-    ["新海诚风格,丰富的色彩,穿着绿色衬衫的女人站在田野里,唯美风景,清新明亮,斑驳的光影,最好的质量,超细节,8K画质",
+    ["신카이 마코토 스타일, 풍부한 색감, 녹색 셔츠를 입은 여성이 들판에 서 있다, 아름다운 풍경, 상쾌하고 밝은, 반짝이는 빛, 최고의 품질, 초세밀, 8K 화질",
     "image/woman_2.png"],
-    ["一只颜色鲜艳的小鸟,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K",
+    ["화려한 색상의 작은 새, 고품질, 초고해상도, 생생한 색상, 최고의 품질, 8k, HD, 4K",
     "image/bird.png"]
 ]
 
 pose_examples = [
-    ["一位穿着紫色泡泡袖连衣裙、戴着皇冠和白色蕾丝手套的女孩双手托脸,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K",
+    ["보라색 퍼프 소매 드레스를 입고 왕관과 흰색 레이스 장갑을 낀 소녀가 양 손으로 얼굴을 감싸고 있다, 고품질, 초고해상도, 생생한 색상, 최고의 품질, 8k, HD, 4K",
     "image/woman_3.png"],
-    ["一个穿着黑色运动外套、白色内搭,上面戴着项链的女子,站在街边,背景是红色建筑和绿树,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K",
+    ["검은색 스포츠 재킷과 흰색 이너를 입고 목걸이를 한 여성이 거리에 서 있다, 배경에는 빨간 건물과 녹색 나무가 있다, 고품질, 초고해상도, 생생한 색상, 최고의 품질, 8k, HD, 4K",
     "image/woman_4.png"]
 ]
 
@@ -252,7 +270,7 @@ with gr.Blocks(css=css) as Kolors:
                label="Negative prompt",
                placeholder="Enter a negative prompt",
                visible=True,
-                value="nsfw,脸部阴影,低分辨率,jpeg伪影、模糊、糟糕,黑脸,霓虹灯"
+                value="nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights"
            )
            seed = gr.Slider(
                label="Seed",
@@ -353,4 +371,4 @@ with gr.Blocks(css=css) as Kolors:
        outputs = [result, seed_used]
    )
 
-Kolors.queue().launch(debug=True)
+Kolors.queue().launch(debug=True)
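
Note: the new translate_korean_to_english helper only sends text through the ko-to-en model when it actually contains Hangul syllables (the Unicode block U+AC00 through U+D7A3), so English input, including the new default negative prompt, passes through unchanged. A minimal standalone sketch of that logic follows; it assumes transformers and sentencepiece are installed, and it omits the @spaces.GPU decorator since that is only meaningful inside a Hugging Face ZeroGPU Space:

from transformers import pipeline

# Same Korean->English model the commit wires in.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")

def translate_korean_to_english(text: str) -> str:
    # Hangul syllables occupy the Unicode block U+AC00..U+D7A3.
    if any(0xAC00 <= ord(ch) <= 0xD7A3 for ch in text):
        return translator(text, max_length=512)[0]["translation_text"]
    return text  # non-Korean input is returned unchanged

print(translate_korean_to_english("아름다운 소녀, 8k, HD, 4K"))  # translated to English
print(translate_korean_to_english("a cute white puppy"))          # returned as-is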