Muhammad Taqi Raza committed
Commit c8296fc · 1 Parent(s): 0659b98

adding caption text

download/download.py CHANGED
@@ -2,22 +2,22 @@ from huggingface_hub import snapshot_download
 
 def download_model():
     snapshot_download(
-        repo_id="tencent/DepthCrafter",
+        repo_id="tencent/DepthCrafter",  # 4 GB
         local_dir="/app/pretrained/DepthCrafter",
         local_dir_use_symlinks=False,
     )
     snapshot_download(
-        repo_id="stabilityai/stable-video-diffusion-img2vid",
+        repo_id="stabilityai/stable-video-diffusion-img2vid",  # 25 gb
         local_dir="/app/pretrained/stable-video-diffusion-img2vid",
         local_dir_use_symlinks=False,
     )
     snapshot_download(
-        repo_id= "Qwen/Qwen2.5-VL-7B-Instruct",
+        repo_id= "Qwen/Qwen2.5-VL-7B-Instruct",  # 16 GB
         local_dir="/app/pretrained/Qwen2.5-VL-7B-Instruct",
         local_dir_use_symlinks=False,
     )
     snapshot_download(
-        repo_id="THUDM/CogVideoX-5b-I2V",
+        repo_id="THUDM/CogVideoX-5b-I2V",  # 22 GB
         local_dir="/app/pretrained/CogVideoX-5b-I2V",
         local_dir_use_symlinks=False,
     )
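The size comments introduced here add up to roughly 67 GB of checkpoints. As a minimal sketch that is not part of this commit, a caller could verify free disk space before invoking download_model(); the 70 GB threshold, the wrapper name, and the import path are illustrative assumptions:

    import os
    import shutil

    from download.download import download_model  # assumed import path for this repo

    REQUIRED_GB = 70  # assumption: ~4 + 25 + 16 + 22 GB of weights, plus headroom

    def download_with_space_check(target="/app/pretrained"):
        os.makedirs(target, exist_ok=True)
        # shutil.disk_usage returns (total, used, free) in bytes for the filesystem
        free_gb = shutil.disk_usage(target).free / 1e9
        if free_gb < REQUIRED_GB:
            raise RuntimeError(
                f"Only {free_gb:.1f} GB free under {target}; ~{REQUIRED_GB} GB needed"
            )
        download_model()

Note that recent huggingface_hub releases deprecate local_dir_use_symlinks; when a local_dir is given, files are materialized there by default, so the flag can likely be dropped.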
gradio_app.py CHANGED
@@ -50,6 +50,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
     temp_input_path = "/app/temp_input.mp4"
     output_dir = "/app/output_anchor"
     video_output_path = f"{output_dir}/masked_videos/output.mp4"
+    captions_text_file = f"{output_dir}/captions/output.txt"
 
     if video_path:
         os.system(f"cp '{video_path}' {temp_input_path}")
@@ -104,8 +105,13 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
     except subprocess.CalledProcessError as e:
         logs += f"Inference failed:\n{e.stderr}{e.stdout}"
         return None, logs
+
+    caption_text = ""
+    if os.path.exists(captions_text_file):
+        with open(captions_text_file, "r") as f:
+            caption_text = f.read()
 
-    return str(video_output_path), logs
+    return str(video_output_path), logs, caption_text
 # -----------------------------
 # Step 2: Run Inference
 # -----------------------------
@@ -197,6 +203,7 @@ with demo:
         video_input = gr.Video(label="Upload Video (MP4)")
         step1_button = gr.Button("▶️ Run Step 1")
         step1_video = gr.Video(label="[Step 1] Masked Video")
+        step1_captions = gr.Textbox(label="[Step 1] Captions", lines=4)
         step1_logs = gr.Textbox(label="[Step 1] Logs")
 
     with gr.TabItem("Step 2: CogVideoX Refinement"):
@@ -235,9 +242,9 @@ with demo:
             depth_steps_input, depth_guidance_input,
             window_input, overlap_input, maxres_input, sample_size,
             seed_input, height, width, aspect_ratio_inputs,
-            init_dx, init_dy, init_dz  # ← NEW INPUTS
+            init_dx, init_dy, init_dz
         ],
-        outputs=[step1_video, step1_logs]
+        outputs=[step1_video, step1_logs, step1_captions]  # ← updated here
     )
 
     step2_button.click(
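With this change, get_anchor_video returns three values whose order must match the outputs list of step1_button.click; note the error path still returns only two values, which Gradio may flag when three outputs are expected. A self-contained sketch of the same wiring pattern, with a hypothetical stand-in handler rather than the real pipeline:

    import gradio as gr

    # Stand-in for get_anchor_video: the real handler runs inference and reads
    # captions/output.txt; this body is hypothetical, only the wiring matters.
    def run_step1(video_path):
        logs = f"processed {video_path}"
        caption_text = "placeholder caption"
        return video_path, logs, caption_text  # order matches `outputs` below

    with gr.Blocks() as demo:
        video_input = gr.Video(label="Upload Video (MP4)")
        step1_button = gr.Button("Run Step 1")
        step1_video = gr.Video(label="[Step 1] Masked Video")
        step1_captions = gr.Textbox(label="[Step 1] Captions", lines=4)
        step1_logs = gr.Textbox(label="[Step 1] Logs")
        step1_button.click(
            run_step1,
            inputs=[video_input],
            outputs=[step1_video, step1_logs, step1_captions],
        )

    demo.launch()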
inference/v2v_data/inference.py CHANGED
@@ -4,7 +4,6 @@ from datetime import datetime
 import argparse
 import torch
 
-
 def get_parser():
     parser = argparse.ArgumentParser()
 
@@ -78,7 +77,7 @@ def get_parser():
     parser.add_argument(
         '--width', type=int, default=1920, help='width'
     )
-    
+
 
     parser.add_argument('--anchor_idx', type=int, default=0, help='One GT frame')
     parser.add_argument(