ByteDance
/

Sa2VA-4B

@@ -1,15 +1,15 @@
 ---
-license: apache-2.0
-pipeline_tag: image-text-to-text
-library_name: transformers
 base_model:
-  - OpenGVLab/InternVL2.5-4B
-base_model_relation: merge
 language:
-  - multilingual
 tags:
-  - Sa2VA
-  - custom_code
 ---
 # Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos
@@ -156,4 +156,4 @@ If you find this project useful in your research, please consider citing:
   journal={arXiv preprint},
   year={2025}
 }
-```

 ---
 base_model:
+- OpenGVLab/InternVL2.5-4B
 language:
+- multilingual
+library_name: transformers
+license: apache-2.0
+pipeline_tag: video-text-to-text
 tags:
+- Sa2VA
+- custom_code
+base_model_relation: merge
 ---
 # Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos
   journal={arXiv preprint},
   year={2025}
 }
+```