Serefor committed on
Commit f74cb43 · verified · 1 Parent(s): 5346862

Update app.py

Files changed (1)
  1. app.py +666 -95
app.py CHANGED
@@ -11,103 +11,674 @@ import tempfile
11
  from PIL import Image
12
  from huggingface_hub import hf_hub_download
13
  import shutil
14
- from diffusers import LTXImageToVideoPipeline  # or LTXConditionPipeline depending on the version
15
-
16
- # -------------------------
17
- # 📦 Download and load the model
18
- # -------------------------
19
- MODEL_ID = "Lightricks/LTX-Video"
20
- CKPT_FILE = "ltxv-2b-0.9.6-distilled-04-25.safetensors"
21
-
22
- local_ckpt = hf_hub_download(
23
- repo_id=MODEL_ID,
24
- filename=CKPT_FILE,
25
- cache_dir="./models",
26
- local_dir_use_symlinks=False
27
  )
28
- pipe = LTXImageToVideoPipeline.from_pretrained(
29
- MODEL_ID,
30
- checkpoint_path=local_ckpt,
31
- torch_dtype=torch.bfloat16
32
- ).to("cuda" if torch.cuda.is_available() else "cpu")
33
-
34
- # -------------------------
35
- # 🔧 Generation functions
36
- # -------------------------
37
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
38
-
39
- def txt2vid(prompt, height, width, num_frames, steps, seed=None):
40
- seed = seed or random.randint(0, 2**32 - 1)
41
- generator = torch.Generator(device=device).manual_seed(seed)
42
- out = pipe(
43
- prompt=prompt,
44
- height=height,
45
- width=width,
46
- num_frames=num_frames,
47
- num_inference_steps=steps,
48
- generator=generator
49
- )
50
- vid = out.videos[0]
51
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
52
- imageio.mimwrite(tmp.name, vid, fps=25)
53
- return tmp.name
54
-
55
- def img2vid(image, prompt, height, width, num_frames, steps, seed=None):
56
- img = Image.fromarray(image)
57
- # 🎥 Save the frame as a 1-frame video
58
- tmp_cond = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
59
- cond_path = tmp_cond.name
60
- imageio.mimwrite(cond_path, [np.array(img)], fps=1)
61
-
62
- seed = seed or random.randint(0, 2**32 - 1)
63
- generator = torch.Generator(device=device).manual_seed(seed)
64
- out = pipe(
65
- prompt=prompt,
66
- height=height,
67
- width=width,
68
- num_frames=num_frames,
69
- num_inference_steps=steps,
70
- generator=generator,
71
- conditioning_media_paths=[cond_path],
72
- conditioning_start_frames=[0]
73
- )
74
- vid = out.videos[0]
75
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
76
- imageio.mimwrite(tmp.name, vid, fps=25)
77
- return tmp.name
78
 
79
 
80
- # -------------------------
81
- # 🎨 Gradio interface
82
- # -------------------------
83
- css = """body { background-color:#111; color:#eee } .gradio-container { max-width:800px; }"""
84
 
85
  with gr.Blocks(css=css) as demo:
86
- gr.Markdown("# LTX‑Video 2B Distilled (Gratuito)")
87
-
88
- with gr.Tab("Text → Video"):
89
- t_prompt = gr.Textbox(label="Prompt", value="A serene landscape at sunrise")
90
- t_h = gr.Slider(128, 720, value=512, step=32, label="Height")
91
- t_w = gr.Slider(128, 1280, value=768, step=32, label="Width")
92
- t_f = gr.Slider(9, 257, value=65, step=8, label="Num Frames")
93
- t_s = gr.Slider(4, 16, value=8, step=1, label="Steps")
94
- t_seed = gr.Number(label="Seed (opcional)", value=0)
95
- t_btn = gr.Button("Generate")
96
- t_out = gr.Video()
97
- t_btn.click(fn=txt2vid, inputs=[t_prompt, t_h, t_w, t_f, t_s, t_seed], outputs=t_out)
98
-
99
- with gr.Tab("Image Video"):
100
- i_img = gr.Image(type="numpy")
101
- i_prompt = gr.Textbox(label="Prompt", value="A cute fox in the snow")
102
- i_h = gr.Slider(128, 720, value=512, step=32, label="Height")
103
- i_w = gr.Slider(128, 1280, value=768, step=32, label="Width")
104
- i_f = gr.Slider(9, 257, value=65, step=8, label="Num Frames")
105
- i_s = gr.Slider(4, 16, value=8, step=1, label="Steps")
106
- i_seed = gr.Number(label="Seed (opcional)", value=0)
107
- i_btn = gr.Button("Generate")
108
- i_out = gr.Video()
109
- i_btn.click(fn=img2vid, inputs=[i_img, i_prompt, i_h, i_w, i_f, i_s, i_seed], outputs=i_out)
110
-
111
- gr.Markdown("**Modelo:** ltxv‑2b‑0.9.6‑distilled resolución múltiplo de 32, frames múltiplo de 8+1 :contentReference[oaicite:1]{index=1}")
112
-
113
- demo.launch()
11
  from PIL import Image
12
  from huggingface_hub import hf_hub_download
13
  import shutil
14
+
15
+ from inference import (
16
+ create_ltx_video_pipeline,
17
+ create_latent_upsampler,
18
+ load_image_to_tensor_with_resize_and_crop,
19
+ seed_everething,
20
+ get_device,
21
+ calculate_padding,
22
+ load_media_file
23
  )
24
+ from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline, LTXVideoPipeline
25
+ from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
26
+
27
+ # Optimized configuration for the free model
28
+ config_file_path = "configs/ltxv-13b-0.9.7-distilled.yaml"
29
+
30
+ # Alternative free models you can use:
31
+ AVAILABLE_FREE_MODELS = {
32
+ "ltx-video": {
33
+ "repo": "Lightricks/LTX-Video",
34
+ "config": "configs/ltxv-13b-0.9.7-distilled.yaml"
35
+ },
36
+ "zeroscope": {
37
+ "repo": "cerspense/zeroscope_v2_576w",
38
+ "config": None # Usar configuración por defecto
39
+ },
40
+ "animatediff": {
41
+ "repo": "guoyww/animatediff-motion-adapter-v1-5-2",
42
+ "config": None
43
+ }
44
+ }
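Since AVAILABLE_FREE_MODELS is plain data, adding another free checkpoint only takes one more entry. A minimal sketch (the "modelscope" key and the damo-vilab/text-to-video-ms-1.7b repo id are illustrative assumptions, not part of this commit):
# Hypothetical extra entry; "config": None falls back to the default PIPELINE_CONFIG_YAML defined below.
AVAILABLE_FREE_MODELS["modelscope"] = {
    "repo": "damo-vilab/text-to-video-ms-1.7b",
    "config": None
}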
45
+
46
+ # Selected model configuration
47
+ SELECTED_MODEL = "ltx-video" # Change this to the model you prefer
48
+ MODEL_CONFIG = AVAILABLE_FREE_MODELS[SELECTED_MODEL]
49
+
50
+ # Load configuration
51
+ if MODEL_CONFIG["config"]:
52
+ with open(MODEL_CONFIG["config"], "r") as file:
53
+ PIPELINE_CONFIG_YAML = yaml.safe_load(file)
54
+ else:
55
+ # Default configuration for models without a specific config file
56
+ PIPELINE_CONFIG_YAML = {
57
+ "max_resolution": 1280,
58
+ "checkpoint_path": "model.safetensors",
59
+ "precision": "bfloat16",
60
+ "text_encoder_model_name_or_path": "google/flan-t5-xl",
61
+ "sampler": "from_checkpoint",
62
+ "spatial_upscaler_model_path": None,
63
+ "decode_timestep": 0.0,
64
+ "decode_noise_scale": 0.0,
65
+ "stochastic_sampling": False,
66
+ "first_pass": {
67
+ "guidance_scale": 3.0,
68
+ "timesteps": None,
69
+ "stg_scale": 0.0,
70
+ "rescaling_scale": 1.0,
71
+ "skip_block_list": None
72
+ }
73
+ }
74
+
75
+ LTX_REPO = MODEL_CONFIG["repo"]
76
+ MAX_IMAGE_SIZE = PIPELINE_CONFIG_YAML.get("max_resolution", 1280)
77
+ MAX_NUM_FRAMES = 257
78
+
79
+ FPS = 30.0
80
+
81
+ # Global variables for the loaded models
82
+ pipeline_instance = None
83
+ latent_upsampler_instance = None
84
+ models_dir = "downloaded_models_gradio_cpu_init"
85
+ Path(models_dir).mkdir(parents=True, exist_ok=True)
86
+
87
+ def setup_free_model():
88
+ """Configura el modelo gratuito seleccionado"""
89
+ global pipeline_instance, latent_upsampler_instance
90
+
91
+ print(f"Configurando modelo gratuito: {SELECTED_MODEL}")
92
+ print(f"Repositorio: {LTX_REPO}")
93
+
94
+ try:
95
+ # Download the main model
96
+ print("Descargando modelo principal (si no está presente)...")
97
+ if SELECTED_MODEL == "ltx-video":
98
+ distilled_model_actual_path = hf_hub_download(
99
+ repo_id=LTX_REPO,
100
+ filename=PIPELINE_CONFIG_YAML["checkpoint_path"],
101
+ local_dir=models_dir,
102
+ local_dir_use_symlinks=False
103
+ )
104
+ PIPELINE_CONFIG_YAML["checkpoint_path"] = distilled_model_actual_path
105
+ print(f"Ruta del modelo: {distilled_model_actual_path}")
106
+
107
+ # Download the spatial upscaler if available
108
+ if PIPELINE_CONFIG_YAML.get("spatial_upscaler_model_path"):
109
+ SPATIAL_UPSCALER_FILENAME = PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"]
110
+ spatial_upscaler_actual_path = hf_hub_download(
111
+ repo_id=LTX_REPO,
112
+ filename=SPATIAL_UPSCALER_FILENAME,
113
+ local_dir=models_dir,
114
+ local_dir_use_symlinks=False
115
+ )
116
+ PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"] = spatial_upscaler_actual_path
117
+ print(f"Ruta del upscaler espacial: {spatial_upscaler_actual_path}")
118
+
119
+ elif SELECTED_MODEL == "zeroscope":
120
+ # Zeroscope-specific setup
121
+ print("Configurando Zeroscope...")
122
+ # Zeroscope uses a different configuration
123
+ from diffusers import DiffusionPipeline
124
+ pipeline_instance = DiffusionPipeline.from_pretrained(
125
+ LTX_REPO,
126
+ torch_dtype=torch.float16
127
+ )
128
+ return
129
+
130
+ elif SELECTED_MODEL == "animatediff":
131
+ # AnimateDiff-specific setup
132
+ print("Configurando AnimateDiff...")
133
+ from diffusers import AnimateDiffPipeline, MotionAdapter
134
+ adapter = MotionAdapter.from_pretrained(LTX_REPO)
135
+ pipeline_instance = AnimateDiffPipeline.from_pretrained(
136
+ "runwayml/stable-diffusion-v1-5",
137
+ motion_adapter=adapter,
138
+ torch_dtype=torch.float16
139
+ )
140
+ return
141
+
142
+ # Create the LTX Video pipeline on CPU
143
+ print("Creando pipeline LTX Video en CPU...")
144
+ pipeline_instance = create_ltx_video_pipeline(
145
+ ckpt_path=PIPELINE_CONFIG_YAML["checkpoint_path"],
146
+ precision=PIPELINE_CONFIG_YAML["precision"],
147
+ text_encoder_model_name_or_path=PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"],
148
+ sampler=PIPELINE_CONFIG_YAML["sampler"],
149
+ device="cpu",
150
+ enhance_prompt=False,
151
+ prompt_enhancer_image_caption_model_name_or_path=PIPELINE_CONFIG_YAML.get("prompt_enhancer_image_caption_model_name_or_path"),
152
+ prompt_enhancer_llm_model_name_or_path=PIPELINE_CONFIG_YAML.get("prompt_enhancer_llm_model_name_or_path"),
153
+ )
154
+ print("Pipeline LTX Video creado en CPU.")
155
+
156
+ # Create the latent upsampler if available
157
+ if PIPELINE_CONFIG_YAML.get("spatial_upscaler_model_path"):
158
+ print("Creando upsampler latente en CPU...")
159
+ latent_upsampler_instance = create_latent_upsampler(
160
+ PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
161
+ device="cpu"
162
+ )
163
+ print("Upsampler latente creado en CPU.")
164
+
165
+ # Move to the inference device
166
+ target_inference_device = "cuda" if torch.cuda.is_available() else "cpu"
167
+ print(f"Dispositivo de inferencia objetivo: {target_inference_device}")
168
+
169
+ pipeline_instance.to(target_inference_device)
170
+ if latent_upsampler_instance:
171
+ latent_upsampler_instance.to(target_inference_device)
172
+
173
+ except Exception as e:
174
+ print(f"Error configurando el modelo: {e}")
175
+ print("Intentando configuración alternativa...")
176
+ # Fallback configuration
177
+ setup_fallback_model()
178
+
179
+ def setup_fallback_model():
180
+ """Configuración de respaldo usando un modelo más simple"""
181
+ global pipeline_instance
182
+ print("Configurando modelo de respaldo...")
183
+
184
+ try:
185
+ from diffusers import DiffusionPipeline
186
+ # Use a lighter model as fallback
187
+ pipeline_instance = DiffusionPipeline.from_pretrained(
188
+ "cerspense/zeroscope_v2_576w",
189
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
190
+ )
191
+ print("Modelo de respaldo configurado exitosamente.")
192
+ except Exception as e:
193
+ print(f"Error configurando modelo de respaldo: {e}")
194
+ raise
195
+
196
+ # Set up the model
197
+ setup_free_model()
198
+
199
+ # Function to switch models dynamically
200
+ def switch_model(model_name):
201
+ """Cambia dinámicamente entre modelos disponibles"""
202
+ global SELECTED_MODEL, pipeline_instance, latent_upsampler_instance
203
+
204
+ if model_name not in AVAILABLE_FREE_MODELS:
205
+ raise ValueError(f"Modelo {model_name} no está disponible")
206
+
207
+ print(f"Cambiando a modelo: {model_name}")
208
+ SELECTED_MODEL = model_name
209
+
210
+ # Free memory
211
+ if pipeline_instance:
212
+ del pipeline_instance
213
+ if latent_upsampler_instance:
214
+ del latent_upsampler_instance
215
+
216
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
217
+
218
+ # Reconfigure with the new model
219
+ setup_free_model()
220
+
221
+ return f"Modelo cambiado a: {model_name}"
222
+
223
+ # The rest of the code stays the same...
224
+ MIN_DIM_SLIDER = 256
225
+ TARGET_FIXED_SIDE = 768
226
+
227
+ def calculate_new_dimensions(orig_w, orig_h):
228
+ """
229
+ Compute new values for the height and width sliders based on the original media dimensions.
230
+ """
231
+ if orig_w == 0 or orig_h == 0:
232
+ return int(TARGET_FIXED_SIDE), int(TARGET_FIXED_SIDE)
233
+
234
+ if orig_w >= orig_h: # Landscape or square
235
+ new_h = TARGET_FIXED_SIDE
236
+ aspect_ratio = orig_w / orig_h
237
+ new_w_ideal = new_h * aspect_ratio
238
+
239
+ new_w = round(new_w_ideal / 32) * 32
240
+ new_w = max(MIN_DIM_SLIDER, min(new_w, MAX_IMAGE_SIZE))
241
+ new_h = max(MIN_DIM_SLIDER, min(new_h, MAX_IMAGE_SIZE))
242
+ else: # Portrait
243
+ new_w = TARGET_FIXED_SIDE
244
+ aspect_ratio = orig_h / orig_w
245
+ new_h_ideal = new_w * aspect_ratio
246
+
247
+ new_h = round(new_h_ideal / 32) * 32
248
+ new_h = max(MIN_DIM_SLIDER, min(new_h, MAX_IMAGE_SIZE))
249
+ new_w = max(MIN_DIM_SLIDER, min(new_w, MAX_IMAGE_SIZE))
250
+
251
+ return int(new_h), int(new_w)
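A worked example of the rounding above, assuming MAX_IMAGE_SIZE resolves to its 1280 default (the 1920x1080 input is hypothetical):
# Landscape 1920x1080: new_h = 768, new_w_ideal = 768 * 1920/1080 ≈ 1365.3
# round(1365.3 / 32) * 32 = 1376, then clamped to MAX_IMAGE_SIZE -> 1280
assert calculate_new_dimensions(1920, 1080) == (768, 1280)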
252
+
253
+ def get_duration(prompt, negative_prompt, input_image_filepath, input_video_filepath,
254
+ height_ui, width_ui, mode,
255
+ duration_ui,
256
+ ui_frames_to_use,
257
+ seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
258
+ progress):
259
+ # Optimization for limited resources
260
+ if duration_ui > 5: # Reduced from 7 to 5 for free models
261
+ return 60 # Reduced from 75 to 60
262
+ else:
263
+ return 45 # Reduced from 60 to 45
264
+
265
+ @spaces.GPU(duration=get_duration)
266
+ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
267
+ height_ui, width_ui, mode,
268
+ duration_ui,
269
+ ui_frames_to_use,
270
+ seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
271
+ progress=gr.Progress(track_tqdm=True)):
272
+
273
+ if randomize_seed:
274
+ seed_ui = random.randint(0, 2**32 - 1)
275
+ seed_everething(int(seed_ui))
276
+
277
+ # Optimize for free models
278
+ target_frames_ideal = min(duration_ui * FPS, 120) # Cap frames to save resources
279
+ target_frames_rounded = round(target_frames_ideal)
280
+ if target_frames_rounded < 1:
281
+ target_frames_rounded = 1
282
+
283
+ n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
284
+ actual_num_frames = int(n_val * 8 + 1)
285
+
286
+ actual_num_frames = max(9, actual_num_frames)
287
+ actual_num_frames = min(MAX_NUM_FRAMES, actual_num_frames)
288
+
289
+ # Optimize resolution for free models
290
+ actual_height = min(int(height_ui), 512) # Cap height
291
+ actual_width = min(int(width_ui), 768) # Cap width
292
+
293
+ height_padded = ((actual_height - 1) // 32 + 1) * 32
294
+ width_padded = ((actual_width - 1) // 32 + 1) * 32
295
+ num_frames_padded = ((actual_num_frames - 2) // 8 + 1) * 8 + 1
296
+
297
+ padding_values = calculate_padding(actual_height, actual_width, height_padded, width_padded)
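The lines above round each spatial dimension up to the next multiple of 32, and padding_values carries the resulting (left, right, top, bottom) pads that are unpacked further down; a quick check of the arithmetic (the 500x700 size is hypothetical):
# height 500 -> ((500 - 1) // 32 + 1) * 32 = 512, i.e. 12 padded rows
# width  700 -> ((700 - 1) // 32 + 1) * 32 = 704, i.e. 4 padded columns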
298
+
299
+ # Call settings optimized for free models
300
+ call_kwargs = {
301
+ "prompt": prompt,
302
+ "negative_prompt": negative_prompt,
303
+ "height": height_padded,
304
+ "width": width_padded,
305
+ "num_frames": num_frames_padded,
306
+ "frame_rate": int(FPS),
307
+ "generator": torch.Generator(device=get_device()).manual_seed(int(seed_ui)),
308
+ "output_type": "pt",
309
+ "conditioning_items": None,
310
+ "media_items": None,
311
+ "decode_timestep": PIPELINE_CONFIG_YAML.get("decode_timestep", 0.0),
312
+ "decode_noise_scale": PIPELINE_CONFIG_YAML.get("decode_noise_scale", 0.0),
313
+ "stochastic_sampling": PIPELINE_CONFIG_YAML.get("stochastic_sampling", False),
314
+ "image_cond_noise_scale": 0.15,
315
+ "is_video": True,
316
+ "vae_per_channel_normalize": True,
317
+ "mixed_precision": (PIPELINE_CONFIG_YAML.get("precision") == "mixed_precision"),
318
+ "offload_to_cpu": True, # Activar para ahorrar memoria
319
+ "enhance_prompt": False,
320
+ }
321
+
322
+ # Configure the skip-layer strategy
323
+ stg_mode_str = PIPELINE_CONFIG_YAML.get("stg_mode", "attention_values")
324
+ if stg_mode_str.lower() in ["stg_av", "attention_values"]:
325
+ call_kwargs["skip_layer_strategy"] = SkipLayerStrategy.AttentionValues
326
+ elif stg_mode_str.lower() in ["stg_as", "attention_skip"]:
327
+ call_kwargs["skip_layer_strategy"] = SkipLayerStrategy.AttentionSkip
328
+ elif stg_mode_str.lower() in ["stg_r", "residual"]:
329
+ call_kwargs["skip_layer_strategy"] = SkipLayerStrategy.Residual
330
+ elif stg_mode_str.lower() in ["stg_t", "transformer_block"]:
331
+ call_kwargs["skip_layer_strategy"] = SkipLayerStrategy.TransformerBlock
332
+
333
+ # Process image or video input
334
+ target_inference_device = get_device()
335
+
336
+ if mode == "image-to-video" and input_image_filepath:
337
+ try:
338
+ media_tensor = load_image_to_tensor_with_resize_and_crop(
339
+ input_image_filepath, actual_height, actual_width
340
+ )
341
+ media_tensor = torch.nn.functional.pad(media_tensor, padding_values)
342
+ call_kwargs["conditioning_items"] = [ConditioningItem(media_tensor.to(target_inference_device), 0, 1.0)]
343
+ except Exception as e:
344
+ print(f"Error cargando imagen {input_image_filepath}: {e}")
345
+ raise gr.Error(f"No se pudo cargar la imagen: {e}")
346
+
347
+ elif mode == "video-to-video" and input_video_filepath:
348
+ try:
349
+ call_kwargs["media_items"] = load_media_file(
350
+ media_path=input_video_filepath,
351
+ height=actual_height,
352
+ width=actual_width,
353
+ max_frames=int(ui_frames_to_use),
354
+ padding=padding_values
355
+ ).to(target_inference_device)
356
+ except Exception as e:
357
+ print(f"Error cargando video {input_video_filepath}: {e}")
358
+ raise gr.Error(f"No se pudo cargar el video: {e}")
359
+
360
+ print(f"Moviendo modelos a {target_inference_device} para inferencia...")
361
+
362
+ # Generate the video
363
+ result_images_tensor = None
364
+ try:
365
+ if improve_texture_flag and latent_upsampler_instance:
366
+ # Use the multi-scale pipeline
367
+ multi_scale_pipeline_obj = LTXMultiScalePipeline(pipeline_instance, latent_upsampler_instance)
368
+
369
+ first_pass_args = PIPELINE_CONFIG_YAML.get("first_pass", {}).copy()
370
+ first_pass_args["guidance_scale"] = float(ui_guidance_scale)
371
+ first_pass_args.pop("num_inference_steps", None)
372
+
373
+ second_pass_args = PIPELINE_CONFIG_YAML.get("second_pass", {}).copy()
374
+ second_pass_args["guidance_scale"] = float(ui_guidance_scale)
375
+ second_pass_args.pop("num_inference_steps", None)
376
+
377
+ multi_scale_call_kwargs = call_kwargs.copy()
378
+ multi_scale_call_kwargs.update({
379
+ "downscale_factor": PIPELINE_CONFIG_YAML.get("downscale_factor", 2),
380
+ "first_pass": first_pass_args,
381
+ "second_pass": second_pass_args,
382
+ })
383
+
384
+ print(f"Llamando pipeline multi-escala...")
385
+ result_images_tensor = multi_scale_pipeline_obj(**multi_scale_call_kwargs).images
386
+ else:
387
+ # Use the single-pass pipeline
388
+ single_pass_call_kwargs = call_kwargs.copy()
389
+ first_pass_config = PIPELINE_CONFIG_YAML.get("first_pass", {})
390
+
391
+ single_pass_call_kwargs["timesteps"] = first_pass_config.get("timesteps")
392
+ single_pass_call_kwargs["guidance_scale"] = float(ui_guidance_scale)
393
+ single_pass_call_kwargs["stg_scale"] = first_pass_config.get("stg_scale", 0.0)
394
+ single_pass_call_kwargs["rescaling_scale"] = first_pass_config.get("rescaling_scale", 1.0)
395
+ single_pass_call_kwargs["skip_block_list"] = first_pass_config.get("skip_block_list")
396
+
397
+ print(f"Llamando pipeline base...")
398
+ result_images_tensor = pipeline_instance(**single_pass_call_kwargs).images
399
+
400
+ except Exception as e:
401
+ print(f"Error en la generación: {e}")
402
+ raise gr.Error(f"Error en la generación: {e}")
403
+
404
+ if result_images_tensor is None:
405
+ raise gr.Error("La generación falló.")
406
+
407
+ # Post-process the result
408
+ pad_left, pad_right, pad_top, pad_bottom = padding_values
409
+ slice_h_end = -pad_bottom if pad_bottom > 0 else None
410
+ slice_w_end = -pad_right if pad_right > 0 else None
411
 
412
+ result_images_tensor = result_images_tensor[
413
+ :, :, :actual_num_frames, pad_top:slice_h_end, pad_left:slice_w_end
414
+ ]
415
+
416
+ video_np = result_images_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy()
417
+ video_np = np.clip(video_np, 0, 1)
418
+ video_np = (video_np * 255).astype(np.uint8)
419
+
420
+ # Save the video
421
+ temp_dir = tempfile.mkdtemp()
422
+ timestamp = random.randint(10000,99999)
423
+ output_video_path = os.path.join(temp_dir, f"output_{timestamp}.mp4")
424
+
425
+ try:
426
+ with imageio.get_writer(output_video_path, fps=call_kwargs["frame_rate"], macro_block_size=1) as video_writer:
427
+ for frame_idx in range(video_np.shape[0]):
428
+ progress(frame_idx / video_np.shape[0], desc="Guardando video")
429
+ video_writer.append_data(video_np[frame_idx])
430
+ except Exception as e:
431
+ print(f"Error guardando video: {e}")
432
+ try:
433
+ with imageio.get_writer(output_video_path, fps=call_kwargs["frame_rate"], format='FFMPEG', codec='libx264', quality=8) as video_writer:
434
+ for frame_idx in range(video_np.shape[0]):
435
+ progress(frame_idx / video_np.shape[0], desc="Guardando video (respaldo)")
436
+ video_writer.append_data(video_np[frame_idx])
437
+ except Exception as e2:
438
+ print(f"Error en respaldo de guardado: {e2}")
439
+ raise gr.Error(f"Error guardando video: {e2}")
440
+
441
+ return output_video_path, seed_ui
442
+
443
+ # Task update functions
444
+ def update_task_image():
445
+ return "image-to-video"
446
 
447
+ def update_task_text():
448
+ return "text-to-video"
 
 
449
 
450
+ def update_task_video():
451
+ return "video-to-video"
452
+
453
+ # CSS for the interface
454
+ css="""
455
+ #col-container {
456
+ margin: 0 auto;
457
+ max-width: 900px;
458
+ }
459
+ .model-info {
460
+ background: #f0f0f0;
461
+ padding: 10px;
462
+ border-radius: 5px;
463
+ margin-bottom: 10px;
464
+ }
465
+ """
466
+
467
+ # Gradio interface
468
  with gr.Blocks(css=css) as demo:
469
+ gr.Markdown("# Generador de Video LTX - Modelos Gratuitos")
470
+ gr.Markdown("Generación de video de alta calidad usando modelos completamente gratuitos.")
471
+
472
+ with gr.Row():
473
+ with gr.Column():
474
+ # Model selector
475
+ with gr.Accordion("Configuración de Modelo", open=False):
476
+ model_selector = gr.Dropdown(
477
+ choices=list(AVAILABLE_FREE_MODELS.keys()),
478
+ value=SELECTED_MODEL,
479
+ label="Modelo a usar",
480
+ info="Todos los modelos son completamente gratuitos"
481
+ )
482
+ model_info = gr.Markdown(f"**Modelo actual:** {SELECTED_MODEL}\n**Repositorio:** {LTX_REPO}", elem_classes="model-info")
483
+ switch_btn = gr.Button("Cambiar Modelo", variant="secondary")
484
+
485
+ with gr.Tab("imagen-a-video") as image_tab:
486
+ video_i_hidden = gr.Textbox(label="video_i", visible=False, value=None)
487
+ image_i2v = gr.Image(label="Imagen de Entrada", type="filepath", sources=["upload", "webcam", "clipboard"])
488
+ i2v_prompt = gr.Textbox(label="Prompt", value="La criatura de la imagen comienza a moverse", lines=3)
489
+ i2v_button = gr.Button("Generar Imagen-a-Video", variant="primary")
490
+
491
+ with gr.Tab("texto-a-video") as text_tab:
492
+ image_n_hidden = gr.Textbox(label="image_n", visible=False, value=None)
493
+ video_n_hidden = gr.Textbox(label="video_n", visible=False, value=None)
494
+ t2v_prompt = gr.Textbox(label="Prompt", value="Un majestuoso dragón volando sobre un castillo medieval", lines=3)
495
+ t2v_button = gr.Button("Generar Texto-a-Video", variant="primary")
496
+
497
+ with gr.Tab("video-a-video", visible=False) as video_tab:
498
+ image_v_hidden = gr.Textbox(label="image_v", visible=False, value=None)
499
+ video_v2v = gr.Video(label="Video de Entrada", sources=["upload", "webcam"])
500
+ frames_to_use = gr.Slider(label="Frames a usar del video de entrada", minimum=9, maximum=MAX_NUM_FRAMES, value=9, step=8)
501
+ v2v_prompt = gr.Textbox(label="Prompt", value="Cambiar el estilo a anime cinematográfico", lines=3)
502
+ v2v_button = gr.Button("Generar Video-a-Video", variant="primary")
503
+
504
+ duration_input = gr.Slider(
505
+ label="Duración del Video (segundos)",
506
+ minimum=0.3,
507
+ maximum=5.0, # Reduced for free models
508
+ value=2,
509
+ step=0.1,
510
+ info="Duración objetivo del video (0.3s a 5.0s)"
511
+ )
512
+ improve_texture = gr.Checkbox(
513
+ label="Mejorar Textura (multi-escala)",
514
+ value=False, # Disabled by default to save resources
515
+ info="Usa generación de dos pasadas para mejor calidad, pero es más lento."
516
+ )
517
+
518
+ with gr.Column():
519
+ output_video = gr.Video(label="Video Generado", interactive=False)
520
+
521
+ with gr.Accordion("Configuración Avanzada", open=False):
522
+ mode = gr.Dropdown(["text-to-video", "image-to-video", "video-to-video"], label="tarea", value="image-to-video", visible=False)  # values must match the mode checks in generate()
523
+ negative_prompt_input = gr.Textbox(
524
+ label="Prompt Negativo",
525
+ value="peor calidad, movimiento inconsistente, borroso, tembloroso, distorsionado",
526
+ lines=2
527
+ )
528
+ with gr.Row():
529
+ seed_input = gr.Number(label="Semilla", value=42, precision=0, minimum=0, maximum=2**32-1)
530
+ randomize_seed_input = gr.Checkbox(label="Semilla Aleatoria", value=True)
531
+ with gr.Row():
532
+ guidance_scale_input = gr.Slider(
533
+ label="Escala de Guía (CFG)",
534
+ minimum=1.0,
535
+ maximum=7.0, # Reduced for free models
536
+ value=3.0,
537
+ step=0.1
538
+ )
539
+ with gr.Row():
540
+ height_input = gr.Slider(
541
+ label="Altura",
542
+ value=512,
543
+ step=32,
544
+ minimum=MIN_DIM_SLIDER,
545
+ maximum=512, # Limited for free models
546
+ info="Debe ser divisible por 32."
547
+ )
548
+ width_input = gr.Slider(
549
+ label="Anchura",
550
+ value=704,
551
+ step=32,
552
+ minimum=MIN_DIM_SLIDER,
553
+ maximum=768, # Limited for free models
554
+ info="Debe ser divisible por 32."
555
+ )
556
+
557
+ # Event handlers
558
+ def handle_image_upload_for_dims(image_filepath, current_h, current_w):
559
+ if not image_filepath:
560
+ return gr.update(value=current_h), gr.update(value=current_w)
561
+ try:
562
+ img = Image.open(image_filepath)
563
+ orig_w, orig_h = img.size
564
+ new_h, new_w = calculate_new_dimensions(orig_w, orig_h)
565
+ # Limit for free models
566
+ new_h = min(new_h, 512)
567
+ new_w = min(new_w, 768)
568
+ return gr.update(value=new_h), gr.update(value=new_w)
569
+ except Exception as e:
570
+ print(f"Error procesando imagen: {e}")
571
+ return gr.update(value=current_h), gr.update(value=current_w)
572
+
573
+ def handle_video_upload_for_dims(video_filepath, current_h, current_w):
574
+ if not video_filepath:
575
+ return gr.update(value=current_h), gr.update(value=current_w)
576
+ try:
577
+ video_filepath_str = str(video_filepath)
578
+ if not os.path.exists(video_filepath_str):
579
+ return gr.update(value=current_h), gr.update(value=current_w)
580
+
581
+ with imageio.get_reader(video_filepath_str) as reader:
582
+ meta = reader.get_meta_data()
583
+ if 'size' in meta:
584
+ orig_w, orig_h = meta['size']
585
+ else:
586
+ first_frame = reader.get_data(0)
587
+ orig_h, orig_w = first_frame.shape[0], first_frame.shape[1]
588
+
589
+ new_h, new_w = calculate_new_dimensions(orig_w, orig_h)
590
+ # Limit for free models
591
+ new_h = min(new_h, 512)
592
+ new_w = min(new_w, 768)
593
+ return gr.update(value=new_h), gr.update(value=new_w)
594
+ except Exception as e:
595
+ print(f"Error procesando video: {e}")
596
+ return gr.update(value=current_h), gr.update(value=current_w)
597
+
598
+ # Wire up events
599
+ image_i2v.upload(
600
+ fn=handle_image_upload_for_dims,
601
+ inputs=[image_i2v, height_input, width_input],
602
+ outputs=[height_input, width_input]
603
+ )
604
+
605
+ video_v2v.upload(
606
+ fn=handle_video_upload_for_dims,
607
+ inputs=[video_v2v, height_input, width_input],
608
+ outputs=[height_input, width_input]
609
+ )
610
+
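Note that update_task_image, update_task_text and update_task_video are defined above but never attached to the tabs, so the hidden mode dropdown keeps its default value; a minimal sketch of the missing wiring (assuming Gradio's Tab.select event, the pattern used in similar LTX-Video Spaces):
# Hypothetical wiring so each tab sets `mode` before generation.
image_tab.select(fn=update_task_image, outputs=[mode])
text_tab.select(fn=update_task_text, outputs=[mode])
video_tab.select(fn=update_task_video, outputs=[mode])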
611
+ # Model switching
613
+ switch_btn.click(
614
+ fn=switch_model,
615
+ inputs=[model_selector],
616
+ outputs=[model_info]
617
+ )
618
+
619
+ # Button: Image to Video
620
+ i2v_button.click(
621
+ fn=generate,
622
+ inputs=[
623
+ i2v_prompt,
624
+ negative_prompt_input,
625
+ image_i2v,
626
+ video_i_hidden,
627
+ height_input,
628
+ width_input,
629
+ mode,
630
+ duration_input,
631
+ frames_to_use,
632
+ seed_input,
633
+ randomize_seed_input,
634
+ guidance_scale_input,
635
+ improve_texture
636
+ ],
637
+ outputs=[output_video, seed_input]
638
+ )
639
+
640
+ # Button: Text to Video
641
+ t2v_button.click(
642
+ fn=generate,
643
+ inputs=[
644
+ t2v_prompt,
645
+ negative_prompt_input,
646
+ image_n_hidden,
647
+ video_n_hidden,
648
+ height_input,
649
+ width_input,
650
+ mode,
651
+ duration_input,
652
+ frames_to_use,
653
+ seed_input,
654
+ randomize_seed_input,
655
+ guidance_scale_input,
656
+ improve_texture
657
+ ],
658
+ outputs=[output_video, seed_input]
659
+ )
660
+
661
+ # Button: Video to Video
662
+ v2v_button.click(
663
+ fn=generate,
664
+ inputs=[
665
+ v2v_prompt,
666
+ negative_prompt_input,
667
+ image_v_hidden,
668
+ video_v2v,
669
+ height_input,
670
+ width_input,
671
+ mode,
672
+ duration_input,
673
+ frames_to_use,
674
+ seed_input,
675
+ randomize_seed_input,
676
+ guidance_scale_input,
677
+ improve_texture
678
+ ],
679
+ outputs=[output_video, seed_input]
680
+ )
681
+
682
+ # Run the app
683
+ if __name__ == "__main__":
684
+ demo.launch()