inoculatemedia committed on
Commit
01f8320
·
verified ·
1 Parent(s): d2df1ed

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +390 -0
app.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from diffusers_helper.hf_login import login
2
+
3
+ import os
4
+
5
+ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
+
7
+ import gradio as gr
8
+ import torch
9
+ import traceback
10
+ import einops
11
+ import safetensors.torch as sf
12
+ import numpy as np
13
+ import argparse
14
+ import math
15
+
16
+ from PIL import Image
17
+ from diffusers import AutoencoderKLHunyuanVideo
18
+ from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
19
+ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
20
+ from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
21
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
22
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
23
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
24
+ from diffusers_helper.thread_utils import AsyncStream, async_run
25
+ from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
26
+ from transformers import SiglipImageProcessor, SiglipVisionModel
27
+ from diffusers_helper.clip_vision import hf_clip_vision_encode
28
+ from diffusers_helper.bucket_tools import find_nearest_bucket
29
+
30
+
31
# Command-line options controlling how the Gradio server is exposed.
# For a Windows desktop, `--server 127.0.0.1 --inbrowser` is probably what you want.
# For a Linux server, probably use `--server 127.0.0.1` or pass no flags at all.
parser = argparse.ArgumentParser()
parser.add_argument('--share', action='store_true')
parser.add_argument('--server', type=str, default='0.0.0.0')
parser.add_argument('--port', type=int)
parser.add_argument('--inbrowser', action='store_true')
args = parser.parse_args()
print(args)

# Probe the free GPU memory once at start-up. `high_vram` is True only when
# more than 60 GB is reported free; the rest of the script branches on it.
free_mem_gb = get_cuda_free_memory_gb(gpu)
high_vram = free_mem_gb > 60
print(f'Free VRAM {free_mem_gb} GB', f'High-VRAM Mode: {high_vram}', sep='\n')
48
+
49
# ---------------------------------------------------------------------------
# Model loading. Every model is first placed on the CPU; GPU placement is
# decided at the end of this section depending on `high_vram`.
# ---------------------------------------------------------------------------
text_encoder = LlamaModel.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder='text_encoder',
    torch_dtype=torch.float16,
).cpu()
text_encoder_2 = CLIPTextModel.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder='text_encoder_2',
    torch_dtype=torch.float16,
).cpu()
tokenizer = LlamaTokenizerFast.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder='tokenizer',
)
tokenizer_2 = CLIPTokenizer.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder='tokenizer_2',
)
vae = AutoencoderKLHunyuanVideo.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder='vae',
    torch_dtype=torch.float16,
).cpu()

feature_extractor = SiglipImageProcessor.from_pretrained(
    "lllyasviel/flux_redux_bfl",
    subfolder='feature_extractor',
)
image_encoder = SiglipVisionModel.from_pretrained(
    "lllyasviel/flux_redux_bfl",
    subfolder='image_encoder',
    torch_dtype=torch.float16,
).cpu()

transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
    'lllyasviel/FramePack_F1_I2V_HY_20250503',
    torch_dtype=torch.bfloat16,
).cpu()

# Inference only: switch every network to eval mode.
for _module in (vae, text_encoder, text_encoder_2, image_encoder, transformer):
    _module.eval()

# Without the large-memory budget, let the VAE work in slices and tiles.
if not high_vram:
    vae.enable_slicing()
    vae.enable_tiling()

transformer.high_quality_fp32_output_for_inference = True
print('transformer.high_quality_fp32_output_for_inference = True')

# Pin the working dtype of each network explicitly.
for _module, _dtype in (
    (transformer, torch.bfloat16),
    (vae, torch.float16),
    (image_encoder, torch.float16),
    (text_encoder, torch.float16),
    (text_encoder_2, torch.float16),
):
    _module.to(dtype=_dtype)

# No training happens in this app, so gradients are disabled everywhere.
for _module in (vae, text_encoder, text_encoder_2, image_encoder, transformer):
    _module.requires_grad_(False)

if high_vram:
    # Enough memory: keep every model resident on the GPU.
    for _module in (text_encoder, text_encoder_2, image_encoder, vae, transformer):
        _module.to(gpu)
else:
    # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
    DynamicSwapInstaller.install_model(transformer, device=gpu)
    DynamicSwapInstaller.install_model(text_encoder, device=gpu)

# Do not leave the loop helpers behind as module-level names.
del _module, _dtype

# Message channel between the background worker and the Gradio callbacks.
stream = AsyncStream()

# All generated images and videos are written here.
outputs_folder = './outputs/'
os.makedirs(outputs_folder, exist_ok=True)
100
+
101
+
102
@torch.no_grad()
def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
    """Generate a video from one image, section by section, reporting via ``stream``.

    The function communicates exclusively through the module-level ``stream``:

    * ``('progress', (preview, desc, html))`` is pushed to
      ``stream.output_queue`` at every stage and at every sampling step;
    * ``('file', path)`` is pushed after each section, once the video decoded
      so far has been written to ``outputs_folder``;
    * ``('end', None)`` is pushed when the job finishes, fails or is cancelled.

    Cancellation is requested by the peer placing ``'end'`` on
    ``stream.input_queue``; it is polled at the start of each section and on
    every sampling step.

    Args:
        input_image: Image array whose ``shape`` unpacks into three values
            (height, width, channels); the UI supplies it from a
            ``gr.Image(type="numpy")`` component.
        prompt: Positive text prompt.
        n_prompt: Negative text prompt; only encoded when ``cfg != 1``.
        seed: Seed for the CPU ``torch.Generator`` used by the sampler.
        total_second_length: Requested video length in seconds (at 30 FPS).
        latent_window_size: Number of latent frames generated per section.
        steps: Number of inference steps per section.
        cfg: Real guidance scale; ``1`` replaces the negative conditioning by
            zeros instead of encoding ``n_prompt``.
        gs: Distilled guidance scale passed to the sampler.
        rs: Guidance rescale passed to the sampler.
        gpu_memory_preservation: Memory (GB) passed as ``preserved_memory_gb``
            when the transformer is moved to the GPU in low-VRAM mode.
        use_teacache: Whether TeaCache is enabled on the transformer.
        mp4_crf: CRF value forwarded to ``save_bcthw_as_mp4``.

    Returns:
        None. All results are delivered through ``stream.output_queue``.
    """
    total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
    total_latent_sections = int(max(round(total_latent_sections), 1))

    job_id = generate_timestamp()

    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

    try:
        # Clean GPU
        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

        # Text encoding

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

        if not high_vram:
            # since we only encode one text - that is one model move and one encode,
            # offload is same time consumption since it is also one load and one encode.
            fake_diffusers_current_device(text_encoder, gpu)
            load_model_as_complete(text_encoder_2, target_device=gpu)

        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        if cfg == 1:
            # No real CFG: the negative branch is all zeros and n_prompt is ignored.
            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
        else:
            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        # Processing input image

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))

        H, W, _ = input_image.shape
        height, width = find_nearest_bucket(H, W, resolution=640)
        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)

        # Keep a copy of the resized/cropped input next to the generated videos.
        Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))

        # Map pixel values to [-1, 1] and reshape HWC -> (1, C, 1, H, W).
        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]

        # VAE encoding

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))

        if not high_vram:
            load_model_as_complete(vae, target_device=gpu)

        start_latent = vae_encode(input_image_pt, vae)

        # CLIP Vision

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

        if not high_vram:
            load_model_as_complete(image_encoder, target_device=gpu)

        image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
        image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

        # Dtype

        llama_vec = llama_vec.to(transformer.dtype)
        llama_vec_n = llama_vec_n.to(transformer.dtype)
        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
        image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)

        # Sampling

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))

        rnd = torch.Generator("cpu").manual_seed(seed)

        # The history starts with 16 + 2 + 1 zero frames, followed by the
        # encoded start image.
        history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
        history_pixels = None

        history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
        total_generated_latent_frames = 1

        def callback(d):
            # Per-step sampler hook: honour cancellation, then publish a preview.
            # The cancel check comes first so no preview is decoded for a step
            # that is about to be thrown away.
            if stream.input_queue.top() == 'end':
                stream.output_queue.push(('end', None))
                raise KeyboardInterrupt('User ends the task.')

            preview = d['denoised']
            preview = vae_decode_fake(preview)

            preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
            preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')

            current_step = d['i'] + 1
            percentage = int(100.0 * current_step / steps)
            hint = f'Sampling {current_step}/{steps}'
            # total_generated_latent_frames is read when the hook runs, so it
            # always reflects the sections completed so far.
            desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
            stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))

        for section_index in range(total_latent_sections):
            if stream.input_queue.top() == 'end':
                stream.output_queue.push(('end', None))
                return

            print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')

            if not high_vram:
                unload_complete_models()
                move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

            if use_teacache:
                transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
            else:
                transformer.initialize_teacache(enable_teacache=False)

            # Index layout along the frame axis:
            # [start (1) | 4x context (16) | 2x context (2) | 1x context (1) | new window].
            indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)

            # The context latents are the last 16 + 2 + 1 frames of the history.
            clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
            clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)

            generated_latents = sample_hunyuan(
                transformer=transformer,
                sampler='unipc',
                width=width,
                height=height,
                frames=latent_window_size * 4 - 3,
                real_guidance_scale=cfg,
                distilled_guidance_scale=gs,
                guidance_rescale=rs,
                # shift=3.0,
                num_inference_steps=steps,
                generator=rnd,
                prompt_embeds=llama_vec,
                prompt_embeds_mask=llama_attention_mask,
                prompt_poolers=clip_l_pooler,
                negative_prompt_embeds=llama_vec_n,
                negative_prompt_embeds_mask=llama_attention_mask_n,
                negative_prompt_poolers=clip_l_pooler_n,
                device=gpu,
                dtype=torch.bfloat16,
                image_embeddings=image_encoder_last_hidden_state,
                latent_indices=latent_indices,
                clean_latents=clean_latents,
                clean_latent_indices=clean_latent_indices,
                clean_latents_2x=clean_latents_2x,
                clean_latent_2x_indices=clean_latent_2x_indices,
                clean_latents_4x=clean_latents_4x,
                clean_latent_4x_indices=clean_latent_4x_indices,
                callback=callback,
            )

            total_generated_latent_frames += int(generated_latents.shape[2])
            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)

            if not high_vram:
                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
                load_model_as_complete(vae, target_device=gpu)

            # Drop the zero padding: only the start frame and generated frames count.
            real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]

            if history_pixels is None:
                history_pixels = vae_decode(real_history_latents, vae).cpu()
            else:
                # Decode only the tail of the history and blend it onto the
                # pixels decoded so far over the overlapping frames.
                section_latent_frames = latent_window_size * 2
                overlapped_frames = latent_window_size * 4 - 3

                current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
                history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)

            if not high_vram:
                unload_complete_models()

            output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

            save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)

            print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')

            stream.output_queue.push(('file', output_filename))
    except BaseException:
        # Deliberately BaseException rather than Exception: the sampling
        # callback cancels the job by raising KeyboardInterrupt, which must be
        # handled here so that the models are unloaded and 'end' is emitted.
        traceback.print_exc()

        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

    stream.output_queue.push(('end', None))
298
+
299
+
300
def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
    """Gradio generator callback: start a generation job and relay its events.

    A fresh ``AsyncStream`` replaces the module-level ``stream``, ``worker`` is
    started through ``async_run`` with the same arguments, and every message
    the worker places on ``stream.output_queue`` is translated into a UI update.

    Yields:
        6-tuples in the order of the ``outputs`` wired to the start button:
        result video, preview image, progress description, progress bar HTML,
        start button state, end button state.

    Raises:
        gr.Error: If no input image was provided.
    """
    global stream

    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with -O, and gr.Error carries the message to the user.
    if input_image is None:
        raise gr.Error('No input image!')

    # Clear the previous outputs; disable "start" and enable "end" while running.
    yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

    stream = AsyncStream()

    async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)

    output_filename = None

    while True:
        flag, data = stream.output_queue.next()

        if flag == 'file':
            # A new (longer) video file is available: show it, leave the rest as is.
            output_filename = data
            yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
        elif flag == 'progress':
            preview, desc, html = data
            yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
        elif flag == 'end':
            # Job is over: hide the preview, clear the bar and flip the buttons back.
            yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
            break
326
+
327
+
328
def end_process():
    """Request cancellation by placing 'end' on the current stream's input queue."""
    control_queue = stream.input_queue
    control_queue.push('end')
330
+
331
+
332
# Example prompts offered in the UI; each sample is a one-element row.
quick_prompts = [
    ['The girl dances gracefully, with clear movements, full of charm.'],
    ['A character doing some simple body movements.'],
]
337
+
338
+
339
css = make_progress_bar_css()
block = gr.Blocks(css=css).queue()
with block:
    gr.Markdown('# FramePack-F1')
    with gr.Row():
        # Left column: inputs and generation settings.
        with gr.Column():
            input_image = gr.Image(
                sources='upload',
                type="numpy",
                label="Image",
                height=320,
            )
            prompt = gr.Textbox(label="Prompt", value='')
            example_quick_prompts = gr.Dataset(
                samples=quick_prompts,
                label='Quick List',
                samples_per_page=1000,
                components=[prompt],
            )
            # Clicking a sample copies its single entry into the prompt box.
            example_quick_prompts.click(
                lambda x: x[0],
                inputs=[example_quick_prompts],
                outputs=prompt,
                show_progress=False,
                queue=False,
            )

            with gr.Row():
                start_button = gr.Button(value="Start Generation")
                end_button = gr.Button(value="End Generation", interactive=False)

            with gr.Group():
                use_teacache = gr.Checkbox(
                    label='Use TeaCache',
                    value=True,
                    info='Faster speed, but often makes hands and fingers slightly worse.',
                )

                n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False)  # Not used
                seed = gr.Number(label="Seed", value=31337, precision=0)

                total_second_length = gr.Slider(
                    label="Total Video Length (Seconds)",
                    minimum=1,
                    maximum=120,
                    value=5,
                    step=0.1,
                )
                latent_window_size = gr.Slider(  # Should not change
                    label="Latent Window Size",
                    minimum=1,
                    maximum=33,
                    value=9,
                    step=1,
                    visible=False,
                )
                steps = gr.Slider(
                    label="Steps",
                    minimum=1,
                    maximum=100,
                    value=25,
                    step=1,
                    info='Changing this value is not recommended.',
                )

                cfg = gr.Slider(  # Should not change
                    label="CFG Scale",
                    minimum=1.0,
                    maximum=32.0,
                    value=1.0,
                    step=0.01,
                    visible=False,
                )
                gs = gr.Slider(
                    label="Distilled CFG Scale",
                    minimum=1.0,
                    maximum=32.0,
                    value=10.0,
                    step=0.01,
                    info='Changing this value is not recommended.',
                )
                rs = gr.Slider(  # Should not change
                    label="CFG Re-Scale",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.0,
                    step=0.01,
                    visible=False,
                )

                gpu_memory_preservation = gr.Slider(
                    label="GPU Inference Preserved Memory (GB) (larger means slower)",
                    minimum=6,
                    maximum=128,
                    value=6,
                    step=0.1,
                    info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.",
                )

                mp4_crf = gr.Slider(
                    label="MP4 Compression",
                    minimum=0,
                    maximum=100,
                    value=16,
                    step=1,
                    info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ",
                )

        # Right column: live preview, finished video and progress read-outs.
        with gr.Column():
            preview_image = gr.Image(label="Next Latents", height=200, visible=False)
            result_video = gr.Video(
                label="Finished Frames",
                autoplay=True,
                show_share_button=False,
                height=512,
                loop=True,
            )
            progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
            progress_bar = gr.HTML('', elem_classes='no-generating-animation')

    gr.HTML('<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>')

    # The order of `ips` must match the parameter order of `process`.
    ips = [
        input_image, prompt, n_prompt, seed, total_second_length,
        latent_window_size, steps, cfg, gs, rs,
        gpu_memory_preservation, use_teacache, mp4_crf,
    ]
    start_button.click(
        fn=process,
        inputs=ips,
        outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
    )
    end_button.click(fn=end_process)
383
+
384
+
385
# Start the Gradio server using the options parsed from the command line.
block.launch(server_name=args.server, server_port=args.port, share=args.share, inbrowser=args.inbrowser)