Commit: del
app.py CHANGED
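This commit replaces bare variable references, which evaluate a name and discard the result without freeing anything, with explicit del statements, so the references are dropped and the underlying tensors become eligible for garbage collection. A minimal sketch of the difference (illustrative tensor, not from app.py):

import torch

t = torch.empty(1024, 1024)
t        # no-op: an expression statement; `t` is still bound afterwards
del t    # unbinds `t`; the tensor can now be reclaimed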
@@ -468,8 +468,8 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
         return [start_latent, image_encoder_last_hidden_state]

     [start_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
-    input_image
-    end_image
+    del input_image
+    del end_image

     # Dtype

@@ -565,7 +565,7 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
     [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]

     if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
-        prompt_parameters[prompt_index]
+        del prompt_parameters[prompt_index]

     if not high_vram:
         unload_complete_models()
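Note that del on a subscript, as in del prompt_parameters[prompt_index] above, is list.__delitem__: it removes the element itself and shrinks the list, unlike del name, which only unbinds a variable. A small sketch:

params = ["a", "b", "c"]
del params[0]               # removes the element; later items shift left
assert params == ["b", "c"]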
@@ -613,6 +613,13 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
             clean_latent_4x_indices=clean_latent_4x_indices,
             callback=callback,
         )
+        del clean_latents
+        del clean_latents_2x
+        del clean_latents_4x
+        del latent_indices
+        del clean_latent_indices
+        del clean_latent_2x_indices
+        del clean_latent_4x_indices

         [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)

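The seven deleted names are the sampler's conditioning inputs, which are no longer needed once generated_latents exists; dropping them before the next memory-heavy step (post_process receives the VAE, so it presumably decodes) trims peak usage. A hypothetical helper sketching the same pattern (run_and_release is not part of app.py):

import gc
import torch

def run_and_release(step, **tensors):
    out = step(**tensors)          # the memory-heavy call
    tensors.clear()                # drop the references held by the kwargs dict
    gc.collect()                   # break any reference cycles still pinning buffers
    if torch.cuda.is_available():
        torch.cuda.empty_cache()   # return cached blocks to the driver
    return out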
@@ -626,7 +633,8 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
         real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
         zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
         history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
-        real_history_latents
+        del real_history_latents
+        del zero_latents

         forward = True
         section_index = first_section_index
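These two dels matter because basic slicing returns views: real_history_latents and zero_latents share storage with the pre-cat history_latents, so the old buffer stays alive until both view names are dropped, even after history_latents is rebound to the torch.cat result. A sketch with toy shapes:

import torch

history_latents = torch.zeros(1, 4, 8, 2, 2)
real = history_latents[:, :, :5]                  # view of the old buffer
zero = history_latents[:, :, 5:]                  # view of the old buffer
history_latents = torch.cat([zero, real], dim=2)  # fresh allocation
del real, zero                                    # old buffer now collectable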
@@ -754,8 +762,8 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
         return [start_latent, end_latent, image_encoder_last_hidden_state]

     [start_latent, end_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, has_end_image, end_image, height, width, vae, gpu, image_encoder, high_vram)
-    input_image
-    end_image
+    del input_image
+    del end_image

     # Dtype
     image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
@@ -905,6 +913,13 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
             clean_latent_4x_indices=clean_latent_4x_indices,
             callback=callback,
         )
+        del clean_latents
+        del clean_latents_2x
+        del clean_latents_4x
+        del latent_indices
+        del clean_latent_indices
+        del clean_latent_2x_indices
+        del clean_latent_4x_indices

         [total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)

@@ -949,7 +964,7 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,

     # 20250506 pftq: Encode video
     start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
-    input_video
+    del input_video
     start_latent = start_latent.to(dtype=torch.float32, device=cpu)
     video_latents = video_latents.cpu()

@@ -987,7 +1002,7 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
         load_model_as_complete(image_encoder, target_device=gpu)

     image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
-    input_image_np
+    del input_image_np

     # 20250507 pftq: Process end frame if provided
     if end_frame is not None:
@@ -999,7 +1014,7 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
             end_frame, target_width=width, target_height=height, vae=vae,
             image_encoder=image_encoder, feature_extractor=feature_extractor, device=gpu
         )[0]
-        end_frame
+        del end_frame
         end_latent = end_latent.to(dtype=torch.float32, device=cpu)
     else:
         end_latent = None
@@ -1009,7 +1024,7 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
         unload_complete_models(image_encoder, vae)

     image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
-    image_encoder_output
+    del image_encoder_output

     # Dtype
     image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
@@ -1119,8 +1134,7 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
     history_latents = video_latents
     total_generated_latent_frames = history_latents.shape[2]
     # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
-    history_pixels = None
-    previous_video = None
+    history_pixels = previous_video = None

     # 20250509 Generate backwards with end frame for better end frame anchoring
     if total_latent_sections > 4:
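The two initializations collapse into one chained assignment, which binds both names to the same object; for an immutable sentinel like None this is exactly equivalent to the two separate lines:

history_pixels = previous_video = None
assert history_pixels is None and previous_video is None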
@@ -1181,13 +1195,13 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
             clean_latent_4x_indices=clean_latent_4x_indices,
             callback=callback,
         )
-        clean_latents
-        clean_latents_2x
-        clean_latents_4x
-        latent_indices
-        clean_latent_indices
-        clean_latent_2x_indices
-        clean_latent_4x_indices
+        del clean_latents
+        del clean_latents_2x
+        del clean_latents_4x
+        del latent_indices
+        del clean_latent_indices
+        del clean_latent_2x_indices
+        del clean_latent_4x_indices

         total_generated_latent_frames += int(generated_latents.shape[2])
         history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
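A quick way to check that these dels actually release memory on a CUDA build (sketch with illustrative sizes): allocated bytes fall back to the baseline as soon as the tensor is dropped, even before empty_cache is called.

import torch

if torch.cuda.is_available():
    baseline = torch.cuda.memory_allocated()
    x = torch.empty(4096, 4096, device="cuda")   # ~64 MiB of float32
    print("after alloc:", torch.cuda.memory_allocated() - baseline)
    del x
    print("after del:", torch.cuda.memory_allocated() - baseline)  # back to 0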