init commit

- .gitattributes +1 -0
- app.py +90 -0
- examples/1.mp4 +3 -0
- examples/2.mp4 +3 -0
- examples/3.mp4 +3 -0
- examples/4.mp4 +3 -0
- fusion_caption.py +184 -0
- struct_caption.py +61 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/*.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,90 @@
+import spaces
+import gradio as gr
+
+from struct_caption import StructCaptioner
+from fusion_caption import FusionCaptioner
+
+struct_captioner = StructCaptioner("Skywork/SkyCaptioner-V1")
+fusion_captioner = FusionCaptioner("Qwen/Qwen3-8B")
+
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        <h1 style="text-align: center; font-size: 2em;">SkyCaptioner-V1</h1>
+        """,
+        elem_id="header"
+    )
+
+    with gr.Row():
+        with gr.Column(scale=0.5):
+            video_input = gr.Video(
+                label="Upload Video",
+                interactive=True,
+                format="mp4",
+            )
+
+            btn_struct = gr.Button("Generate Struct Caption")
+
+        with gr.Column():
+            struct_caption_output = gr.Code(
+                label="Struct Caption",
+                language="json",
+                lines=25,
+                interactive=False
+            )
+
+    with gr.Row():
+        with gr.Column(scale=0.5):
+            with gr.Row():
+                task_input = gr.Radio(
+                    label="Task Type",
+                    choices=["t2v", "i2v"],
+                    value="t2v",
+                    interactive=True
+                )
+            btn_fusion = gr.Button("Generate Fusion Caption")
+
+        with gr.Column():
+            fusion_caption_output = gr.Textbox(
+                label="Fusion Caption",
+                value="",
+                interactive=False
+            )
+
+    @spaces.GPU(duration=120)
+    def generate_struct_caption(video):
+        struct_caption = struct_captioner(video)
+        return struct_caption
+
+    @spaces.GPU(duration=120)
+    def generate_fusion_caption(struct_caption_str, task):
+        return fusion_captioner(struct_caption_str, task)
+
+    btn_struct.click(
+        fn=generate_struct_caption,
+        inputs=video_input,
+        outputs=struct_caption_output
+    )
+
+    btn_fusion.click(
+        fn=generate_fusion_caption,
+        inputs=[struct_caption_output, task_input],
+        outputs=fusion_caption_output
+    )
+
+    gr.Examples(
+        examples=[
+            ["./examples/1.mp4"],
+            ["./examples/2.mp4"],
+            ["./examples/3.mp4"],
+            ["./examples/4.mp4"],
+        ],
+        inputs=video_input,
+        label="Example Videos"
+    )
+
+demo.launch(
+    server_name="0.0.0.0",
+    server_port=7872,
+    share=False
+)
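For reference, the two Gradio callbacks above simply chain the structured captioner into the fusion captioner. The sketch below (not part of this commit) shows the same two-stage flow driven from a plain script; it assumes a CUDA device, Hub access to the two model repos, and that the example clip has been materialized from Git LFS rather than left as a pointer file.

# Hypothetical headless run of the same pipeline; not part of the commit.
from struct_caption import StructCaptioner
from fusion_caption import FusionCaptioner

struct_captioner = StructCaptioner("Skywork/SkyCaptioner-V1")   # Qwen2.5-VL-based structured captioner
fusion_captioner = FusionCaptioner("Qwen/Qwen3-8B")             # LLM that rewrites the JSON into prose

struct_caption_json = struct_captioner("./examples/1.mp4")      # JSON string: subjects, shot fields, camera motion, ...
final_caption = fusion_captioner(struct_caption_json, task="t2v")
print(final_caption)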
examples/1.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16becc00811e427d9e3f5da5a977239907ea3a6d45b5482b9bcfea2abc3c6b7f
+size 4821469

examples/2.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:109b954b7b1e36addef840554af53241415492e71972c87b7bfb03f77bf0d68a
+size 1030652

examples/3.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fd544787b24265605cf16c954f495fe9d73680e0529f1438e47c02803a9c2bf
+size 1661366

examples/4.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5cc4038924aae963a54779d713f7f9680259ae81a4446fb2775cc6bb51d907c
+size 2523332
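These four files are committed as Git LFS pointers (the `.gitattributes` change above routes `examples/*.mp4` through LFS); the actual video bytes are only present after `git lfs pull`. As a small illustration, and not part of the commit, a checkout can detect whether a clip is still a pointer by looking for the spec line shown above:

# Hypothetical helper (not in the commit): distinguish LFS pointer files from real videos.
from pathlib import Path

def is_lfs_pointer(path: str) -> bool:
    # LFS pointer files begin with the literal spec line.
    head = Path(path).read_bytes()[:64]
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

for clip in ["examples/1.mp4", "examples/2.mp4", "examples/3.mp4", "examples/4.mp4"]:
    print(clip, "LFS pointer" if is_lfs_pointer(clip) else "binary video")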
fusion_caption.py ADDED
@@ -0,0 +1,184 @@
+import json
+import random
+import pandas as pd
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+SYSTEM_PROMPT_I2V = """
+You are an expert in video captioning. You are given a structured video caption and you need to compose it to be more natural and fluent in English.
+
+## Structured Input
+{structured_input}
+
+## Notes
+1. If there has an empty field, just ignore it and do not mention it in the output.
+2. Do not make any semantic changes to the original fields. Please be sure to follow the original meaning.
+3. If the action field is not empty, eliminate the irrelevant information in the action field that is not related to the timing action(such as wearings, background and environment information) to make a pure action field.
+
+## Output Principles and Orders
+1. First, eliminate the static information in the action field that is not related to the timing action, such as background or environment information.
+2. Second, describe each subject with its pure action and expression if these fields exist.
+
+## Output
+Please directly output the final composed caption without any additional information.
+"""
+
+SYSTEM_PROMPT_T2V = """
+You are an expert in video captioning. You are given a structured video caption and you need to compose it to be more natural and fluent in English.
+
+## Structured Input
+{structured_input}
+
+## Notes
+1. According to the action field information, change its name field to the subject pronoun in the action.
+2. If there has an empty field, just ignore it and do not mention it in the output.
+3. Do not make any semantic changes to the original fields. Please be sure to follow the original meaning.
+
+## Output Principles and Orders
+1. First, declare the shot_type, then declare the shot_angle and the shot_position fields in natural and fluent.
+2. Second, eliminate information in the action field that is not related to the timing action, such as background or environment information if action is not empty.
+3. Third, describe each subject with its pure action, appearance, expression, position if these fields exist.
+4. Finally, declare the environment and lighting if the environment and lighting fields are not empty.
+
+## Output
+Please directly output the final composed caption without any additional information.
+"""
+
+
+class StructuralCaptionDataset(torch.utils.data.Dataset):
+    def __init__(self, input_csv, model_path, task):
+        if isinstance(input_csv, pd.DataFrame):
+            self.meta = input_csv
+        else:
+            self.meta = pd.read_csv(input_csv)
+        self.task = task
+        self.system_prompt = SYSTEM_PROMPT_T2V if self.task == 't2v' else SYSTEM_PROMPT_I2V
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    def __len__(self):
+        return len(self.meta)
+
+    def __getitem__(self, index):
+        row = self.meta.iloc[index]
+        real_index = self.meta.index[index]
+
+        struct_caption = json.loads(row["structural_caption"])
+
+        camera_movement = struct_caption.get('camera_motion', '')
+        if camera_movement != '':
+            camera_movement += '.'
+            camera_movement = camera_movement.capitalize()
+
+        fusion_by_llm = False
+        cleaned_struct_caption = self.clean_struct_caption(struct_caption, self.task)
+        if cleaned_struct_caption.get('num_subjects', 0) > 0:
+            new_struct_caption = json.dumps(cleaned_struct_caption, indent=4, ensure_ascii=False)
+            conversation = [
+                {
+                    "role": "user",
+                    "content": self.system_prompt.format(structured_input=new_struct_caption),
+                },
+            ]
+            text = self.tokenizer.apply_chat_template(
+                conversation,
+                tokenize=False,
+                add_generation_prompt=True,
+                enable_thinking=False,
+            )
+            fusion_by_llm = True
+        else:
+            text = '-'
+        return real_index, fusion_by_llm, text, '-', camera_movement
+
+    def clean_struct_caption(self, struct_caption, task):
+        raw_subjects = struct_caption.get('subjects', [])
+        subjects = []
+        for subject in raw_subjects:
+            subject_type = subject.get("TYPES", {}).get('type', '')
+            subject_sub_type = subject.get("TYPES", {}).get('sub_type', '')
+            if subject_type not in ["Human", "Animal"]:
+                subject['expression'] = ''
+            if subject_type == 'Human' and subject_sub_type == 'Accessory':
+                subject['expression'] = ''
+            if subject_sub_type != '':
+                subject['name'] = subject_sub_type
+            if 'TYPES' in subject:
+                del subject['TYPES']
+            if 'is_main_subject' in subject:
+                del subject['is_main_subject']
+            subjects.append(subject)
+
+        to_del_subject_ids = []
+        for idx, subject in enumerate(subjects):
+            action = subject.get('action', '').strip()
+            subject['action'] = action
+            if random.random() > 0.9 and 'appearance' in subject:
+                del subject['appearance']
+            if random.random() > 0.9 and 'position' in subject:
+                del subject['position']
+            if task == 'i2v':
+                # just keep name and action, expression in subjects
+                dropped_keys = ['appearance', 'position']
+                for key in dropped_keys:
+                    if key in subject:
+                        del subject[key]
+            if subject['action'] == '' and ('expression' not in subject or subject['expression'] == ''):
+                to_del_subject_ids.append(idx)
+
+        # delete the subjects according to the to_del_subject_ids
+        for idx in sorted(to_del_subject_ids, reverse=True):
+            del subjects[idx]
+
+        new_struct_caption = {
+            'num_subjects': len(subjects),
+            'subjects': subjects,
+            'shot_type': struct_caption.get('shot_type', ''),
+            'shot_angle': struct_caption.get('shot_angle', ''),
+            'shot_position': struct_caption.get('shot_position', ''),
+            'environment': struct_caption.get('environment', ''),
+            'lighting': struct_caption.get('lighting', ''),
+        }
+
+        if task == 't2v' and random.random() > 0.9:
+            del new_struct_caption['lighting']
+
+        if task == 'i2v':
+            drop_keys = ['environment', 'lighting', 'shot_type', 'shot_angle', 'shot_position']
+            for drop_key in drop_keys:
+                del new_struct_caption[drop_key]
+        return new_struct_caption
+
+
+class FusionCaptioner:
+    def __init__(self, model_path):
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype="auto",
+            device_map="cuda",
+        )
+
+        self.model_path = model_path
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    def __call__(self, structural_caption, task='t2v'):
+        if isinstance(structural_caption, dict):
+            structural_caption = json.dumps(structural_caption, ensure_ascii=False)
+        else:
+            structural_caption = json.dumps(json.loads(structural_caption), ensure_ascii=False)
+        meta = pd.DataFrame([structural_caption], columns=['structural_caption'])
+        dataset = StructuralCaptionDataset(meta, self.model_path, task)
+        _, fusion_by_llm, text, original_text, camera_movement = dataset[0]
+        if not fusion_by_llm:
+            caption = original_text + " " + camera_movement
+            return caption
+
+        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
+        generated_ids = self.model.generate(**model_inputs, max_new_tokens=1024, temperature=0.1)
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+        result = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        llm_caption = result + " " + camera_movement
+        return llm_caption
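FusionCaptioner accepts either a JSON string (as produced by the Gradio Code box) or a plain dict following the schema from struct_caption.py. The sketch below is illustrative only and not part of the commit; the field values are made up, and it assumes Qwen/Qwen3-8B can be downloaded and a CUDA device is present. When no subjects survive cleaning, the caller gets back only the capitalized camera-motion sentence; otherwise the structured JSON is rewritten by the LLM.

# Hypothetical direct use of FusionCaptioner with made-up field values; not part of the commit.
from fusion_caption import FusionCaptioner

captioner = FusionCaptioner("Qwen/Qwen3-8B")

structured = {
    "subjects": [
        {
            "appearance": "a man in a red jacket",            # illustrative values only
            "action": "walks along the shoreline",
            "expression": "calm",
            "position": "center of the frame",
            "TYPES": {"type": "Human", "sub_type": "Man"},
            "is_main_subject": True,
        }
    ],
    "shot_type": "medium_shot",
    "shot_angle": "eye_level",
    "shot_position": "front_view",
    "camera_motion": "the camera pans slowly to the right",
    "environment": "a beach at sunset",
    "lighting": "warm golden light",
}

# Accepts a dict or a JSON string; task is "t2v" or "i2v".
print(captioner(structured, task="t2v"))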
struct_caption.py ADDED
@@ -0,0 +1,61 @@
+import json
+import torch
+
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+
+SYSTEM_PROMPT = "I need you to generate a structured and detailed caption for the provided video. The structured output and the requirements for each field are as shown in the following JSON content: {\"subjects\": [{\"appearance\": \"Main subject appearance description\", \"action\": \"Main subject action\", \"expression\": \"Main subject expression (Only for human/animal categories, empty otherwise)\", \"position\": \"Subject position in the video (Can be relative position to other objects or spatial description)\", \"TYPES\": {\"type\": \"Main category (e.g., Human)\", \"sub_type\": \"Sub-category (e.g., Man)\"}, \"is_main_subject\": true}, {\"appearance\": \"Non-main subject appearance description\", \"action\": \"Non-main subject action\", \"expression\": \"Non-main subject expression (Only for human/animal categories, empty otherwise)\", \"position\": \"Position of non-main subject 1\", \"TYPES\": {\"type\": \"Main category (e.g., Vehicles)\", \"sub_type\": \"Sub-category (e.g., Ship)\"}, \"is_main_subject\": false}], \"shot_type\": \"Shot type(Options: long_shot/full_shot/medium_shot/close_up/extreme_close_up/other)\", \"shot_angle\": \"Camera angle(Options: eye_level/high_angle/low_angle/other)\", \"shot_position\": \"Camera position(Options: front_view/back_view/side_view/over_the_shoulder/overhead_view/point_of_view/aerial_view/overlooking_view/other)\", \"camera_motion\": \"Camera movement description\", \"environment\": \"Video background/environment description\", \"lighting\": \"Lighting information in the video\"}"
+
+class StructCaptioner:
+    def __init__(self, model_path):
+        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch.bfloat16,
+            # attn_implementation="flash_attention_2",
+            device_map="cuda",
+        )
+        self.processor = AutoProcessor.from_pretrained(model_path)
+
+    def __call__(self, video_path):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "video": video_path,
+                        "max_pixels": 360 * 420,
+                        "fps": 2.0,
+                    },
+                    {"type": "text", "text": SYSTEM_PROMPT},
+                ],
+            }
+        ]
+
+        text = self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
+
+        inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+            **video_kwargs,
+        )
+
+        inputs = inputs.to("cuda")
+
+        generated_ids = self.model.generate(**inputs, max_new_tokens=2048, temperature=0.05)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_texts = self.processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        caption = json.loads(output_texts[0])
+        caption = json.dumps(caption, indent=4, ensure_ascii=False)
+        return caption
+
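Because StructCaptioner round-trips the model output through json.loads before returning a pretty-printed string, callers can parse that string back into a dict for filtering or storage. A minimal sketch, not part of the commit, assuming the Skywork/SkyCaptioner-V1 weights are reachable, a CUDA device is available, the clip has been fetched from Git LFS, and the model follows the schema in SYSTEM_PROMPT:

# Hypothetical post-processing of StructCaptioner output; not part of the commit.
import json
from struct_caption import StructCaptioner

captioner = StructCaptioner("Skywork/SkyCaptioner-V1")
raw = captioner("./examples/2.mp4")   # pretty-printed JSON string

caption = json.loads(raw)             # back to a dict for downstream use
print(caption["shot_type"], "|", caption["camera_motion"])
for subject in caption["subjects"]:
    print(subject["TYPES"]["type"], "-", subject["action"])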