pinoo committed
Commit 0d1c12c · Parent(s): 8397d3d

init commit

Files changed (8):
  1. .gitattributes +1 -0
  2. app.py +90 -0
  3. examples/1.mp4 +3 -0
  4. examples/2.mp4 +3 -0
  5. examples/3.mp4 +3 -0
  6. examples/4.mp4 +3 -0
  7. fusion_caption.py +184 -0
  8. struct_caption.py +61 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ examples/*.mp4 filter=lfs diff=lfs merge=lfs -text
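Note: the added rule routes the example MP4s through Git LFS. For reference, it is the same line that `git lfs track "examples/*.mp4"` would append to .gitattributes when run at the repository root, assuming Git LFS is installed.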
app.py ADDED
@@ -0,0 +1,90 @@
+ import spaces
+ import gradio as gr
+
+ from struct_caption import StructCaptioner
+ from fusion_caption import FusionCaptioner
+
+ struct_captioner = StructCaptioner("Skywork/SkyCaptioner-V1")
+ fusion_captioner = FusionCaptioner("Qwen/Qwen3-8B")
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         <h1 style="text-align: center; font-size: 2em;">SkyCaptioner-V1</h1>
+         """,
+         elem_id="header"
+     )
+
+     with gr.Row():
+         with gr.Column(scale=0.5):
+             video_input = gr.Video(
+                 label="Upload Video",
+                 interactive=True,
+                 format="mp4",
+             )
+
+             btn_struct = gr.Button("Generate Struct Caption")
+
+         with gr.Column():
+             struct_caption_output = gr.Code(
+                 label="Struct Caption",
+                 language="json",
+                 lines=25,
+                 interactive=False
+             )
+
+     with gr.Row():
+         with gr.Column(scale=0.5):
+             with gr.Row():
+                 task_input = gr.Radio(
+                     label="Task Type",
+                     choices=["t2v", "i2v"],
+                     value="t2v",
+                     interactive=True
+                 )
+                 btn_fusion = gr.Button("Generate Fusion Caption")
+
+         with gr.Column():
+             fusion_caption_output = gr.Textbox(
+                 label="Fusion Caption",
+                 value="",
+                 interactive=False
+             )
+
+     @spaces.GPU(duration=120)
+     def generate_struct_caption(video):
+         struct_caption = struct_captioner(video)
+         return struct_caption
+
+     @spaces.GPU(duration=120)
+     def generate_fusion_caption(struct_caption_str, task):
+         return fusion_captioner(struct_caption_str, task)
+
+     btn_struct.click(
+         fn=generate_struct_caption,
+         inputs=video_input,
+         outputs=struct_caption_output
+     )
+
+     btn_fusion.click(
+         fn=generate_fusion_caption,
+         inputs=[struct_caption_output, task_input],
+         outputs=fusion_caption_output
+     )
+
+     gr.Examples(
+         examples=[
+             ["./examples/1.mp4"],
+             ["./examples/2.mp4"],
+             ["./examples/3.mp4"],
+             ["./examples/4.mp4"],
+         ],
+         inputs=video_input,
+         label="Example Videos"
+     )
+
+ demo.launch(
+     server_name="0.0.0.0",
+     server_port=7872,
+     share=False
+ )
examples/1.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16becc00811e427d9e3f5da5a977239907ea3a6d45b5482b9bcfea2abc3c6b7f
+ size 4821469
examples/2.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:109b954b7b1e36addef840554af53241415492e71972c87b7bfb03f77bf0d68a
+ size 1030652
examples/3.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fd544787b24265605cf16c954f495fe9d73680e0529f1438e47c02803a9c2bf
+ size 1661366
examples/4.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5cc4038924aae963a54779d713f7f9680259ae81a4446fb2775cc6bb51d907c
+ size 2523332
fusion_caption.py ADDED
@@ -0,0 +1,184 @@
+ import json
+ import random
+ import pandas as pd
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ SYSTEM_PROMPT_I2V = """
+ You are an expert in video captioning. You are given a structured video caption and you need to compose it to be more natural and fluent in English.
+
+ ## Structured Input
+ {structured_input}
+
+ ## Notes
+ 1. If there has an empty field, just ignore it and do not mention it in the output.
+ 2. Do not make any semantic changes to the original fields. Please be sure to follow the original meaning.
+ 3. If the action field is not empty, eliminate the irrelevant information in the action field that is not related to the timing action(such as wearings, background and environment information) to make a pure action field.
+
+ ## Output Principles and Orders
+ 1. First, eliminate the static information in the action field that is not related to the timing action, such as background or environment information.
+ 2. Second, describe each subject with its pure action and expression if these fields exist.
+
+ ## Output
+ Please directly output the final composed caption without any additional information.
+ """
+
+ SYSTEM_PROMPT_T2V = """
+ You are an expert in video captioning. You are given a structured video caption and you need to compose it to be more natural and fluent in English.
+
+ ## Structured Input
+ {structured_input}
+
+ ## Notes
+ 1. According to the action field information, change its name field to the subject pronoun in the action.
+ 2. If there has an empty field, just ignore it and do not mention it in the output.
+ 3. Do not make any semantic changes to the original fields. Please be sure to follow the original meaning.
+
+ ## Output Principles and Orders
+ 1. First, declare the shot_type, then declare the shot_angle and the shot_position fields in natural and fluent.
+ 2. Second, eliminate information in the action field that is not related to the timing action, such as background or environment information if action is not empty.
+ 3. Third, describe each subject with its pure action, appearance, expression, position if these fields exist.
+ 4. Finally, declare the environment and lighting if the environment and lighting fields are not empty.
+
+ ## Output
+ Please directly output the final composed caption without any additional information.
+ """
+
+
+ class StructuralCaptionDataset(torch.utils.data.Dataset):
+     def __init__(self, input_csv, model_path, task):
+         if isinstance(input_csv, pd.DataFrame):
+             self.meta = input_csv
+         else:
+             self.meta = pd.read_csv(input_csv)
+         self.task = task
+         self.system_prompt = SYSTEM_PROMPT_T2V if self.task == 't2v' else SYSTEM_PROMPT_I2V
+         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+     def __len__(self):
+         return len(self.meta)
+
+     def __getitem__(self, index):
+         row = self.meta.iloc[index]
+         real_index = self.meta.index[index]
+
+         struct_caption = json.loads(row["structural_caption"])
+
+         camera_movement = struct_caption.get('camera_motion', '')
+         if camera_movement != '':
+             camera_movement += '.'
+             camera_movement = camera_movement.capitalize()
+
+         fusion_by_llm = False
+         cleaned_struct_caption = self.clean_struct_caption(struct_caption, self.task)
+         if cleaned_struct_caption.get('num_subjects', 0) > 0:
+             new_struct_caption = json.dumps(cleaned_struct_caption, indent=4, ensure_ascii=False)
+             conversation = [
+                 {
+                     "role": "user",
+                     "content": self.system_prompt.format(structured_input=new_struct_caption),
+                 },
+             ]
+             text = self.tokenizer.apply_chat_template(
+                 conversation,
+                 tokenize=False,
+                 add_generation_prompt=True,
+                 enable_thinking=False,
+             )
+             fusion_by_llm = True
+         else:
+             text = '-'
+         return real_index, fusion_by_llm, text, '-', camera_movement
+
+     def clean_struct_caption(self, struct_caption, task):
+         raw_subjects = struct_caption.get('subjects', [])
+         subjects = []
+         for subject in raw_subjects:
+             subject_type = subject.get("TYPES", {}).get('type', '')
+             subject_sub_type = subject.get("TYPES", {}).get('sub_type', '')
+             if subject_type not in ["Human", "Animal"]:
+                 subject['expression'] = ''
+             if subject_type == 'Human' and subject_sub_type == 'Accessory':
+                 subject['expression'] = ''
+             if subject_sub_type != '':
+                 subject['name'] = subject_sub_type
+             if 'TYPES' in subject:
+                 del subject['TYPES']
+             if 'is_main_subject' in subject:
+                 del subject['is_main_subject']
+             subjects.append(subject)
+
+         to_del_subject_ids = []
+         for idx, subject in enumerate(subjects):
+             action = subject.get('action', '').strip()
+             subject['action'] = action
+             if random.random() > 0.9 and 'appearance' in subject:
+                 del subject['appearance']
+             if random.random() > 0.9 and 'position' in subject:
+                 del subject['position']
+             if task == 'i2v':
+                 # just keep name and action, expression in subjects
+                 dropped_keys = ['appearance', 'position']
+                 for key in dropped_keys:
+                     if key in subject:
+                         del subject[key]
+                 if subject['action'] == '' and ('expression' not in subject or subject['expression'] == ''):
+                     to_del_subject_ids.append(idx)
+
+         # delete the subjects according to the to_del_subject_ids
+         for idx in sorted(to_del_subject_ids, reverse=True):
+             del subjects[idx]
+
+         new_struct_caption = {
+             'num_subjects': len(subjects),
+             'subjects': subjects,
+             'shot_type': struct_caption.get('shot_type', ''),
+             'shot_angle': struct_caption.get('shot_angle', ''),
+             'shot_position': struct_caption.get('shot_position', ''),
+             'environment': struct_caption.get('environment', ''),
+             'lighting': struct_caption.get('lighting', ''),
+         }
+
+         if task == 't2v' and random.random() > 0.9:
+             del new_struct_caption['lighting']
+
+         if task == 'i2v':
+             drop_keys = ['environment', 'lighting', 'shot_type', 'shot_angle', 'shot_position']
+             for drop_key in drop_keys:
+                 del new_struct_caption[drop_key]
+         return new_struct_caption
+
+
+ class FusionCaptioner:
+     def __init__(self, model_path):
+         self.model = AutoModelForCausalLM.from_pretrained(
+             model_path,
+             torch_dtype="auto",
+             device_map="cuda",
+         )
+
+         self.model_path = model_path
+         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+     def __call__(self, structural_caption, task='t2v'):
+         if isinstance(structural_caption, dict):
+             structural_caption = json.dumps(structural_caption, ensure_ascii=False)
+         else:
+             structural_caption = json.dumps(json.loads(structural_caption), ensure_ascii=False)
+         meta = pd.DataFrame([structural_caption], columns=['structural_caption'])
+         dataset = StructuralCaptionDataset(meta, self.model_path, task)
+         _, fusion_by_llm, text, original_text, camera_movement = dataset[0]
+         if not fusion_by_llm:
+             caption = original_text + " " + camera_movement
+             return caption
+
+         model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
+         generated_ids = self.model.generate(**model_inputs, max_new_tokens=1024, temperature=0.1)
+         generated_ids = [
+             output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+         ]
+         result = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+         llm_caption = result + " " + camera_movement
+         return llm_caption
struct_caption.py ADDED
@@ -0,0 +1,61 @@
+ import json
+ import torch
+
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+
+ SYSTEM_PROMPT = "I need you to generate a structured and detailed caption for the provided video. The structured output and the requirements for each field are as shown in the following JSON content: {\"subjects\": [{\"appearance\": \"Main subject appearance description\", \"action\": \"Main subject action\", \"expression\": \"Main subject expression (Only for human/animal categories, empty otherwise)\", \"position\": \"Subject position in the video (Can be relative position to other objects or spatial description)\", \"TYPES\": {\"type\": \"Main category (e.g., Human)\", \"sub_type\": \"Sub-category (e.g., Man)\"}, \"is_main_subject\": true}, {\"appearance\": \"Non-main subject appearance description\", \"action\": \"Non-main subject action\", \"expression\": \"Non-main subject expression (Only for human/animal categories, empty otherwise)\", \"position\": \"Position of non-main subject 1\", \"TYPES\": {\"type\": \"Main category (e.g., Vehicles)\", \"sub_type\": \"Sub-category (e.g., Ship)\"}, \"is_main_subject\": false}], \"shot_type\": \"Shot type(Options: long_shot/full_shot/medium_shot/close_up/extreme_close_up/other)\", \"shot_angle\": \"Camera angle(Options: eye_level/high_angle/low_angle/other)\", \"shot_position\": \"Camera position(Options: front_view/back_view/side_view/over_the_shoulder/overhead_view/point_of_view/aerial_view/overlooking_view/other)\", \"camera_motion\": \"Camera movement description\", \"environment\": \"Video background/environment description\", \"lighting\": \"Lighting information in the video\"}"
+
+ class StructCaptioner:
+     def __init__(self, model_path):
+         self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+             model_path,
+             torch_dtype=torch.bfloat16,
+             # attn_implementation="flash_attention_2",
+             device_map="cuda",
+         )
+         self.processor = AutoProcessor.from_pretrained(model_path)
+
+     def __call__(self, video_path):
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "video",
+                         "video": video_path,
+                         "max_pixels": 360 * 420,
+                         "fps": 2.0,
+                     },
+                     {"type": "text", "text": SYSTEM_PROMPT},
+                 ],
+             }
+         ]
+
+         text = self.processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
+
+         inputs = self.processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt",
+             **video_kwargs,
+         )
+
+         inputs = inputs.to("cuda")
+
+         generated_ids = self.model.generate(**inputs, max_new_tokens=2048, temperature=0.05)
+         generated_ids_trimmed = [
+             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+         ]
+         output_texts = self.processor.batch_decode(
+             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+         )
+         caption = json.loads(output_texts[0])
+         caption = json.dumps(caption, indent=4, ensure_ascii=False)
+         return caption
+
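Taken together, the two new modules form the two-stage pipeline that app.py exposes through Gradio: StructCaptioner turns a video into a structured JSON caption, and FusionCaptioner rewrites that JSON into a fluent natural-language caption. A minimal standalone sketch, not part of this commit, assuming a CUDA GPU and the same checkpoints that app.py loads:

    # Illustrative two-stage captioning outside the Gradio demo (sketch only).
    from struct_caption import StructCaptioner
    from fusion_caption import FusionCaptioner

    struct_captioner = StructCaptioner("Skywork/SkyCaptioner-V1")  # video -> structured JSON caption
    fusion_captioner = FusionCaptioner("Qwen/Qwen3-8B")            # structured JSON -> fluent caption

    struct_json = struct_captioner("examples/1.mp4")               # returns a pretty-printed JSON string
    final_caption = fusion_captioner(struct_json, task="t2v")      # task is "t2v" or "i2v"
    print(final_caption)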