Commit c8296fc by Muhammad Taqi Raza: "adding caption text"
Parent(s): 0659b98

Files changed:
- download/download.py            +4 -4
- gradio_app.py                   +10 -3
- inference/v2v_data/inference.py +1 -2
download/download.py
CHANGED

@@ -2,22 +2,22 @@ from huggingface_hub import snapshot_download
 
 def download_model():
     snapshot_download(
-        repo_id="tencent/DepthCrafter",
+        repo_id="tencent/DepthCrafter",  # 4 GB
         local_dir="/app/pretrained/DepthCrafter",
         local_dir_use_symlinks=False,
     )
     snapshot_download(
-        repo_id="stabilityai/stable-video-diffusion-img2vid",
+        repo_id="stabilityai/stable-video-diffusion-img2vid",  # 25 gb
         local_dir="/app/pretrained/stable-video-diffusion-img2vid",
         local_dir_use_symlinks=False,
     )
     snapshot_download(
-        repo_id= "Qwen/Qwen2.5-VL-7B-Instruct",
+        repo_id= "Qwen/Qwen2.5-VL-7B-Instruct",  # 16 GB
         local_dir="/app/pretrained/Qwen2.5-VL-7B-Instruct",
         local_dir_use_symlinks=False,
     )
     snapshot_download(
-        repo_id="THUDM/CogVideoX-5b-I2V",
+        repo_id="THUDM/CogVideoX-5b-I2V",  # 22 GB
         local_dir="/app/pretrained/CogVideoX-5b-I2V",
         local_dir_use_symlinks=False,
     )
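The size comments added here put the four snapshots at roughly 67 GB combined. Below is a minimal sketch of guarding download_model() with a free-space check before pulling the weights; the 70 GB threshold, the /app mount point, and the import path are assumptions for illustration, not part of this commit.

import shutil

from download.download import download_model  # import path assumed from this repo's layout

REQUIRED_GB = 70  # assumption: ~4 + 25 + 16 + 22 GB of weights plus headroom

if __name__ == "__main__":
    # Check free space on the volume that holds /app/pretrained before downloading.
    free_gb = shutil.disk_usage("/app").free / 1e9
    if free_gb < REQUIRED_GB:
        raise SystemExit(f"Need ~{REQUIRED_GB} GB free, only {free_gb:.1f} GB available")
    download_model()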
gradio_app.py
CHANGED

@@ -50,6 +50,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
     temp_input_path = "/app/temp_input.mp4"
     output_dir = "/app/output_anchor"
     video_output_path = f"{output_dir}/masked_videos/output.mp4"
+    captions_text_file = f"{output_dir}/captions/output.txt"
 
     if video_path:
         os.system(f"cp '{video_path}' {temp_input_path}")

@@ -104,8 +105,13 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
     except subprocess.CalledProcessError as e:
         logs += f"Inference failed:\n{e.stderr}{e.stdout}"
         return None, logs
+
+    caption_text = ""
+    if os.path.exists(captions_text_file):
+        with open(captions_text_file, "r") as f:
+            caption_text = f.read()
 
-    return str(video_output_path), logs
+    return str(video_output_path), logs, caption_text
     # -----------------------------
     # Step 2: Run Inference
     # -----------------------------

@@ -197,6 +203,7 @@ with demo:
            video_input = gr.Video(label="Upload Video (MP4)")
            step1_button = gr.Button("▶️ Run Step 1")
            step1_video = gr.Video(label="[Step 1] Masked Video")
+           step1_captions = gr.Textbox(label="[Step 1] Captions", lines=4)
            step1_logs = gr.Textbox(label="[Step 1] Logs")
 
        with gr.TabItem("Step 2: CogVideoX Refinement"):

@@ -235,9 +242,9 @@ with demo:
            depth_steps_input, depth_guidance_input,
            window_input, overlap_input, maxres_input, sample_size,
            seed_input, height, width, aspect_ratio_inputs,
-           init_dx, init_dy, init_dz
+           init_dx, init_dy, init_dz
        ],
-       outputs=[step1_video, step1_logs]
+       outputs=[step1_video, step1_logs, step1_captions]  # ← updated here
    )
 
    step2_button.click(
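After this change the Step 1 handler returns three values (video path, logs, caption text), so the click wiring must list three output components. Below is a self-contained sketch of that pattern with a stub handler standing in for the repo's get_anchor_video; the component labels mirror the diff, everything else is illustrative.

# Minimal sketch of the Step 1 wiring pattern after this commit. The handler
# body is a stub, not the repo's actual get_anchor_video implementation.
import os
import gradio as gr

def get_anchor_video_stub(video_path):
    output_dir = "/app/output_anchor"
    video_output_path = f"{output_dir}/masked_videos/output.mp4"
    captions_text_file = f"{output_dir}/captions/output.txt"
    logs = "ran step 1 (stub)\n"
    caption_text = ""
    if os.path.exists(captions_text_file):
        with open(captions_text_file, "r") as f:
            caption_text = f.read()
    # Three return values must match the three components in outputs=[...].
    return video_output_path, logs, caption_text

with gr.Blocks() as demo:
    video_input = gr.Video(label="Upload Video (MP4)")
    step1_button = gr.Button("Run Step 1")
    step1_video = gr.Video(label="[Step 1] Masked Video")
    step1_captions = gr.Textbox(label="[Step 1] Captions", lines=4)
    step1_logs = gr.Textbox(label="[Step 1] Logs")
    step1_button.click(
        get_anchor_video_stub,
        inputs=[video_input],
        outputs=[step1_video, step1_logs, step1_captions],
    )

if __name__ == "__main__":
    demo.launch()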
inference/v2v_data/inference.py
CHANGED

@@ -4,7 +4,6 @@ from datetime import datetime
 import argparse
 import torch
 
-
 def get_parser():
     parser = argparse.ArgumentParser()
 

@@ -78,7 +77,7 @@ def get_parser():
     parser.add_argument(
         '--width', type=int, default=1920, help='width'
     )
-
+
 
     parser.add_argument('--anchor_idx', type=int, default=0, help='One GT frame')
     parser.add_argument(
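For context, get_parser() defines the flags that the Step 1 pipeline passes when it runs this script (the subprocess.CalledProcessError handling in gradio_app.py above suggests a subprocess invocation). A hedged sketch of such a call follows; only --width and --anchor_idx and their defaults come from this diff, while the script path and the rest are assumptions.

# Sketch of driving inference.py from Python, mirroring the error handling
# seen in the gradio_app.py hunk. Flag values shown are the argparse defaults
# from this diff; any other flags the real pipeline passes are not shown here.
import subprocess

cmd = [
    "python", "inference/v2v_data/inference.py",
    "--width", "1920",     # default from get_parser()
    "--anchor_idx", "0",   # default: one GT frame
]

try:
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    logs = result.stdout
except subprocess.CalledProcessError as e:
    logs = f"Inference failed:\n{e.stderr}{e.stdout}"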