init commit

- .gitattributes +1 -0
- app.py +90 -0
- examples/1.mp4 +3 -0
- examples/2.mp4 +3 -0
- examples/3.mp4 +3 -0
- examples/4.mp4 +3 -0
- fusion_caption.py +184 -0
- struct_caption.py +61 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/*.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,90 @@
+import spaces
+import gradio as gr
+
+from struct_caption import StructCaptioner
+from fusion_caption import FusionCaptioner
+
+struct_captioner = StructCaptioner("Skywork/SkyCaptioner-V1")
+fusion_captioner = FusionCaptioner("Qwen/Qwen3-8B")
+
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        <h1 style="text-align: center; font-size: 2em;">SkyCaptioner-V1</h1>
+        """,
+        elem_id="header"
+    )
+
+    with gr.Row():
+        with gr.Column(scale=0.5):
+            video_input = gr.Video(
+                label="Upload Video",
+                interactive=True,
+                format="mp4",
+            )
+
+            btn_struct = gr.Button("Generate Struct Caption")
+
+        with gr.Column():
+            struct_caption_output = gr.Code(
+                label="Struct Caption",
+                language="json",
+                lines=25,
+                interactive=False
+            )
+
+    with gr.Row():
+        with gr.Column(scale=0.5):
+            with gr.Row():
+                task_input = gr.Radio(
+                    label="Task Type",
+                    choices=["t2v", "i2v"],
+                    value="t2v",
+                    interactive=True
+                )
+            btn_fusion = gr.Button("Generate Fusion Caption")
+
+        with gr.Column():
+            fusion_caption_output = gr.Textbox(
+                label="Fusion Caption",
+                value="",
+                interactive=False
+            )
+
+    @spaces.GPU(duration=120)
+    def generate_struct_caption(video):
+        struct_caption = struct_captioner(video)
+        return struct_caption
+
+    @spaces.GPU(duration=120)
+    def generate_fusion_caption(struct_caption_str, task):
+        return fusion_captioner(struct_caption_str, task)
+
+    btn_struct.click(
+        fn=generate_struct_caption,
+        inputs=video_input,
+        outputs=struct_caption_output
+    )
+
+    btn_fusion.click(
+        fn=generate_fusion_caption,
+        inputs=[struct_caption_output, task_input],
+        outputs=fusion_caption_output
+    )
+
+    gr.Examples(
+        examples=[
+            ["./examples/1.mp4"],
+            ["./examples/2.mp4"],
+            ["./examples/3.mp4"],
+            ["./examples/4.mp4"],
+        ],
+        inputs=video_input,
+        label="Example Videos"
+    )
+
+demo.launch(
+    server_name="0.0.0.0",
+    server_port=7872,
+    share=False
+)
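For reference, the two Gradio callbacks above simply chain the structured captioner into the fusion captioner. The sketch below (not part of this commit) shows the same two-stage flow driven from a plain script; it assumes a CUDA device, Hub access to the two model repos, and that the example clip has been materialized from Git LFS rather than left as a pointer file.

# Hypothetical headless run of the same pipeline; not part of the commit.
from struct_caption import StructCaptioner
from fusion_caption import FusionCaptioner

struct_captioner = StructCaptioner("Skywork/SkyCaptioner-V1")   # Qwen2.5-VL-based structured captioner
fusion_captioner = FusionCaptioner("Qwen/Qwen3-8B")             # LLM that rewrites the JSON into prose

struct_caption_json = struct_captioner("./examples/1.mp4")      # JSON string: subjects, shot fields, camera motion, ...
final_caption = fusion_captioner(struct_caption_json, task="t2v")
print(final_caption)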
examples/1.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16becc00811e427d9e3f5da5a977239907ea3a6d45b5482b9bcfea2abc3c6b7f
+size 4821469

examples/2.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:109b954b7b1e36addef840554af53241415492e71972c87b7bfb03f77bf0d68a
+size 1030652

examples/3.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fd544787b24265605cf16c954f495fe9d73680e0529f1438e47c02803a9c2bf
+size 1661366

examples/4.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5cc4038924aae963a54779d713f7f9680259ae81a4446fb2775cc6bb51d907c
+size 2523332
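These four files are committed as Git LFS pointers (the `.gitattributes` change above routes `examples/*.mp4` through LFS); the actual video bytes are only present after `git lfs pull`. As a small illustration, and not part of the commit, a checkout can detect whether a clip is still a pointer by looking for the spec line shown above:

# Hypothetical helper (not in the commit): distinguish LFS pointer files from real videos.
from pathlib import Path

def is_lfs_pointer(path: str) -> bool:
    # LFS pointer files begin with the literal spec line.
    head = Path(path).read_bytes()[:64]
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

for clip in ["examples/1.mp4", "examples/2.mp4", "examples/3.mp4", "examples/4.mp4"]:
    print(clip, "LFS pointer" if is_lfs_pointer(clip) else "binary video")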
fusion_caption.py ADDED
@@ -0,0 +1,184 @@
+import json
+import random
+import pandas as pd
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+SYSTEM_PROMPT_I2V = """
+You are an expert in video captioning. You are given a structured video caption and you need to compose it to be more natural and fluent in English.
+
+## Structured Input
+{structured_input}
+
+## Notes
+1. If there has an empty field, just ignore it and do not mention it in the output.
+2. Do not make any semantic changes to the original fields. Please be sure to follow the original meaning.
+3. If the action field is not empty, eliminate the irrelevant information in the action field that is not related to the timing action(such as wearings, background and environment information) to make a pure action field.
+
+## Output Principles and Orders
+1. First, eliminate the static information in the action field that is not related to the timing action, such as background or environment information.
+2. Second, describe each subject with its pure action and expression if these fields exist.
+
+## Output
+Please directly output the final composed caption without any additional information.
+"""
+
+SYSTEM_PROMPT_T2V = """
+You are an expert in video captioning. You are given a structured video caption and you need to compose it to be more natural and fluent in English.
+
+## Structured Input
+{structured_input}
+
+## Notes
+1. According to the action field information, change its name field to the subject pronoun in the action.
+2. If there has an empty field, just ignore it and do not mention it in the output.
+3. Do not make any semantic changes to the original fields. Please be sure to follow the original meaning.
+
+## Output Principles and Orders
+1. First, declare the shot_type, then declare the shot_angle and the shot_position fields in natural and fluent.
+2. Second, eliminate information in the action field that is not related to the timing action, such as background or environment information if action is not empty.
+3. Third, describe each subject with its pure action, appearance, expression, position if these fields exist.
+4. Finally, declare the environment and lighting if the environment and lighting fields are not empty.
+
+## Output
+Please directly output the final composed caption without any additional information.
+"""
+
+
+class StructuralCaptionDataset(torch.utils.data.Dataset):
+    def __init__(self, input_csv, model_path, task):
+        if isinstance(input_csv, pd.DataFrame):
+            self.meta = input_csv
+        else:
+            self.meta = pd.read_csv(input_csv)
+        self.task = task
+        self.system_prompt = SYSTEM_PROMPT_T2V if self.task == 't2v' else SYSTEM_PROMPT_I2V
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    def __len__(self):
+        return len(self.meta)
+
+    def __getitem__(self, index):
+        row = self.meta.iloc[index]
+        real_index = self.meta.index[index]
+
+        struct_caption = json.loads(row["structural_caption"])
+
+        camera_movement = struct_caption.get('camera_motion', '')
+        if camera_movement != '':
+            camera_movement += '.'
+            camera_movement = camera_movement.capitalize()
+
+        fusion_by_llm = False
+        cleaned_struct_caption = self.clean_struct_caption(struct_caption, self.task)
+        if cleaned_struct_caption.get('num_subjects', 0) > 0:
+            new_struct_caption = json.dumps(cleaned_struct_caption, indent=4, ensure_ascii=False)
+            conversation = [
+                {
+                    "role": "user",
+                    "content": self.system_prompt.format(structured_input=new_struct_caption),
+                },
+            ]
+            text = self.tokenizer.apply_chat_template(
+                conversation,
+                tokenize=False,
+                add_generation_prompt=True,
+                enable_thinking=False,
+            )
+            fusion_by_llm = True
+        else:
+            text = '-'
+        return real_index, fusion_by_llm, text, '-', camera_movement
+
+    def clean_struct_caption(self, struct_caption, task):
+        raw_subjects = struct_caption.get('subjects', [])
+        subjects = []
+        for subject in raw_subjects:
+            subject_type = subject.get("TYPES", {}).get('type', '')
+            subject_sub_type = subject.get("TYPES", {}).get('sub_type', '')
+            if subject_type not in ["Human", "Animal"]:
+                subject['expression'] = ''
+            if subject_type == 'Human' and subject_sub_type == 'Accessory':
+                subject['expression'] = ''
+            if subject_sub_type != '':
+                subject['name'] = subject_sub_type
+            if 'TYPES' in subject:
+                del subject['TYPES']
+            if 'is_main_subject' in subject:
+                del subject['is_main_subject']
+            subjects.append(subject)
+
+        to_del_subject_ids = []
+        for idx, subject in enumerate(subjects):
+            action = subject.get('action', '').strip()
+            subject['action'] = action
+            if random.random() > 0.9 and 'appearance' in subject:
+                del subject['appearance']
+            if random.random() > 0.9 and 'position' in subject:
+                del subject['position']
+            if task == 'i2v':
+                # just keep name and action, expression in subjects
+                dropped_keys = ['appearance', 'position']
+                for key in dropped_keys:
+                    if key in subject:
+                        del subject[key]
+            if subject['action'] == '' and ('expression' not in subject or subject['expression'] == ''):
+                to_del_subject_ids.append(idx)
+
+        # delete the subjects according to the to_del_subject_ids
+        for idx in sorted(to_del_subject_ids, reverse=True):
+            del subjects[idx]
+
+        new_struct_caption = {
+            'num_subjects': len(subjects),
+            'subjects': subjects,
+            'shot_type': struct_caption.get('shot_type', ''),
+            'shot_angle': struct_caption.get('shot_angle', ''),
+            'shot_position': struct_caption.get('shot_position', ''),
+            'environment': struct_caption.get('environment', ''),
+            'lighting': struct_caption.get('lighting', ''),
+        }
+
+        if task == 't2v' and random.random() > 0.9:
+            del new_struct_caption['lighting']
+
+        if task == 'i2v':
+            drop_keys = ['environment', 'lighting', 'shot_type', 'shot_angle', 'shot_position']
+            for drop_key in drop_keys:
+                del new_struct_caption[drop_key]
+        return new_struct_caption
+
+
+class FusionCaptioner:
+    def __init__(self, model_path):
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype="auto",
+            device_map="cuda",
+        )
+
+        self.model_path = model_path
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    def __call__(self, structural_caption, task='t2v'):
+        if isinstance(structural_caption, dict):
+            structural_caption = json.dumps(structural_caption, ensure_ascii=False)
+        else:
+            structural_caption = json.dumps(json.loads(structural_caption), ensure_ascii=False)
+        meta = pd.DataFrame([structural_caption], columns=['structural_caption'])
+        dataset = StructuralCaptionDataset(meta, self.model_path, task)
+        _, fusion_by_llm, text, original_text, camera_movement = dataset[0]
+        if not fusion_by_llm:
+            caption = original_text + " " + camera_movement
+            return caption
+
+        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
+        generated_ids = self.model.generate(**model_inputs, max_new_tokens=1024, temperature=0.1)
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+        result = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        llm_caption = result + " " + camera_movement
+        return llm_caption
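FusionCaptioner accepts either a JSON string (as produced by the Gradio Code box) or a plain dict following the schema from struct_caption.py. The sketch below is illustrative only and not part of the commit; the field values are made up, and it assumes Qwen/Qwen3-8B can be downloaded and a CUDA device is present. When no subjects survive cleaning, the caller gets back only the capitalized camera-motion sentence; otherwise the structured JSON is rewritten by the LLM.

# Hypothetical direct use of FusionCaptioner with made-up field values; not part of the commit.
from fusion_caption import FusionCaptioner

captioner = FusionCaptioner("Qwen/Qwen3-8B")

structured = {
    "subjects": [
        {
            "appearance": "a man in a red jacket",            # illustrative values only
            "action": "walks along the shoreline",
            "expression": "calm",
            "position": "center of the frame",
            "TYPES": {"type": "Human", "sub_type": "Man"},
            "is_main_subject": True,
        }
    ],
    "shot_type": "medium_shot",
    "shot_angle": "eye_level",
    "shot_position": "front_view",
    "camera_motion": "the camera pans slowly to the right",
    "environment": "a beach at sunset",
    "lighting": "warm golden light",
}

# Accepts a dict or a JSON string; task is "t2v" or "i2v".
print(captioner(structured, task="t2v"))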
struct_caption.py ADDED
@@ -0,0 +1,61 @@
+import json
+import torch
+
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+
+SYSTEM_PROMPT = "I need you to generate a structured and detailed caption for the provided video. The structured output and the requirements for each field are as shown in the following JSON content: {\"subjects\": [{\"appearance\": \"Main subject appearance description\", \"action\": \"Main subject action\", \"expression\": \"Main subject expression (Only for human/animal categories, empty otherwise)\", \"position\": \"Subject position in the video (Can be relative position to other objects or spatial description)\", \"TYPES\": {\"type\": \"Main category (e.g., Human)\", \"sub_type\": \"Sub-category (e.g., Man)\"}, \"is_main_subject\": true}, {\"appearance\": \"Non-main subject appearance description\", \"action\": \"Non-main subject action\", \"expression\": \"Non-main subject expression (Only for human/animal categories, empty otherwise)\", \"position\": \"Position of non-main subject 1\", \"TYPES\": {\"type\": \"Main category (e.g., Vehicles)\", \"sub_type\": \"Sub-category (e.g., Ship)\"}, \"is_main_subject\": false}], \"shot_type\": \"Shot type(Options: long_shot/full_shot/medium_shot/close_up/extreme_close_up/other)\", \"shot_angle\": \"Camera angle(Options: eye_level/high_angle/low_angle/other)\", \"shot_position\": \"Camera position(Options: front_view/back_view/side_view/over_the_shoulder/overhead_view/point_of_view/aerial_view/overlooking_view/other)\", \"camera_motion\": \"Camera movement description\", \"environment\": \"Video background/environment description\", \"lighting\": \"Lighting information in the video\"}"
+
+class StructCaptioner:
+    def __init__(self, model_path):
+        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch.bfloat16,
+            # attn_implementation="flash_attention_2",
+            device_map="cuda",
+        )
+        self.processor = AutoProcessor.from_pretrained(model_path)
+
+    def __call__(self, video_path):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "video": video_path,
+                        "max_pixels": 360 * 420,
+                        "fps": 2.0,
+                    },
+                    {"type": "text", "text": SYSTEM_PROMPT},
+                ],
+            }
+        ]
+
+        text = self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
+
+        inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+            **video_kwargs,
+        )
+
+        inputs = inputs.to("cuda")
+
+        generated_ids = self.model.generate(**inputs, max_new_tokens=2048, temperature=0.05)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_texts = self.processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        caption = json.loads(output_texts[0])
+        caption = json.dumps(caption, indent=4, ensure_ascii=False)
+        return caption
+
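Because StructCaptioner round-trips the model output through json.loads before returning a pretty-printed string, callers can parse that string back into a dict for filtering or storage. A minimal sketch, not part of the commit, assuming the Skywork/SkyCaptioner-V1 weights are reachable, a CUDA device is available, the clip has been fetched from Git LFS, and the model follows the schema in SYSTEM_PROMPT:

# Hypothetical post-processing of StructCaptioner output; not part of the commit.
import json
from struct_caption import StructCaptioner

captioner = StructCaptioner("Skywork/SkyCaptioner-V1")
raw = captioner("./examples/2.mp4")   # pretty-printed JSON string

caption = json.loads(raw)             # back to a dict for downstream use
print(caption["shot_type"], "|", caption["camera_motion"])
for subject in caption["subjects"]:
    print(subject["TYPES"]["type"], "-", subject["action"])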