Spaces:
Sleeping
Sleeping
舟勤
committed on
Commit
•
8eaeb01
1
Parent(s):
15554e1
test
Browse files- .gitattributes +1 -0
- app.py +45 -11
- eval_configs/video_llama_eval.yaml +1 -1
.gitattributes
CHANGED
@@ -31,4 +31,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
31 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
|
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
31 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -59,16 +59,16 @@ def setup_seeds(config):
|
|
59 |
|
60 |
print('Initializing Chat')
|
61 |
args = parse_args()
|
62 |
-
cfg = Config(args)
|
63 |
|
64 |
-
model_config = cfg.model_cfg
|
65 |
-
model_config.device_8bit = args.gpu_id
|
66 |
-
model_cls = registry.get_model_class(model_config.arch)
|
67 |
-
model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))
|
68 |
|
69 |
-
vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
|
70 |
-
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
|
71 |
-
chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))
|
72 |
print('Initialization Finished')
|
73 |
|
74 |
# ========================================
|
@@ -143,6 +143,9 @@ title = """
|
|
143 |
|
144 |
<h1 align="center">Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding</h1>
|
145 |
|
|
|
|
|
|
|
146 |
<div style='display:flex; gap: 0.25rem; '>
|
147 |
<a href='https://github.com/DAMO-NLP-SG/Video-LLaMA'><img src='https://img.shields.io/badge/Github-Code-success'></a>
|
148 |
<a href='https://huggingface.co/spaces/DAMO-NLP-SG/Video-LLaMA'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
|
@@ -152,8 +155,24 @@ title = """
|
|
152 |
</div>
|
153 |
|
154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
"""
|
156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
#TODO show examples below
|
158 |
|
159 |
with gr.Blocks() as demo:
|
@@ -185,20 +204,35 @@ with gr.Blocks() as demo:
|
|
185 |
label="Temperature",
|
186 |
)
|
187 |
|
|
|
|
|
188 |
with gr.Column():
|
189 |
chat_state = gr.State()
|
190 |
img_list = gr.State()
|
191 |
chatbot = gr.Chatbot(label='Video-LLaMA')
|
192 |
text_input = gr.Textbox(label='User', placeholder='Please upload your image/video first', interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
|
194 |
-
|
195 |
upload_button.click(upload_imgorvideo, [video, image, text_input, chat_state], [video, image, text_input, upload_button, chat_state, img_list])
|
196 |
|
197 |
text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
|
198 |
gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
|
199 |
)
|
200 |
clear.click(gradio_reset, [chat_state, img_list], [chatbot, video, image, text_input, upload_button, chat_state, img_list], queue=False)
|
201 |
-
|
202 |
demo.launch(share=False, enable_queue=True)
|
203 |
|
204 |
-
# %%
|
|
|
59 |
|
60 |
print('Initializing Chat')
|
61 |
args = parse_args()
|
62 |
+
# cfg = Config(args)
|
63 |
|
64 |
+
# model_config = cfg.model_cfg
|
65 |
+
# model_config.device_8bit = args.gpu_id
|
66 |
+
# model_cls = registry.get_model_class(model_config.arch)
|
67 |
+
# model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))
|
68 |
|
69 |
+
# vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
|
70 |
+
# vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
|
71 |
+
# chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))
|
72 |
print('Initialization Finished')
|
73 |
|
74 |
# ========================================
|
|
|
143 |
|
144 |
<h1 align="center">Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding</h1>
|
145 |
|
146 |
+
<h5 align="center"> Introduction: Video-LLaMA is a multi-modal large language model that achieves video-grounded conversations between humans and computers \
|
147 |
+
by connecting a language decoder with off-the-shelf unimodal pre-trained models. </h5>
|
148 |
+
|
149 |
<div style='display:flex; gap: 0.25rem; '>
|
150 |
<a href='https://github.com/DAMO-NLP-SG/Video-LLaMA'><img src='https://img.shields.io/badge/Github-Code-success'></a>
|
151 |
<a href='https://huggingface.co/spaces/DAMO-NLP-SG/Video-LLaMA'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
|
|
|
155 |
</div>
|
156 |
|
157 |
|
158 |
+
Thank you for using the Video-LLaMA Demo Page! If you have any questions or feedback, feel free to contact us.
|
159 |
+
|
160 |
+
If you find Video-LLaMA interesting, please give us a star on GitHub.
|
161 |
+
|
162 |
+
Current online demo uses the 7B version of PandaGPT due to resource limitations. We have released \
|
163 |
+
the 13B version on our GitHub repository.
|
164 |
+
|
165 |
+
|
166 |
"""
|
167 |
|
168 |
+
Note_markdown = ("""
|
169 |
+
### Note
|
170 |
+
Video-LLaMA is a prototype model and may have limitations in understanding complex scenes, long videos, or specific domains.
|
171 |
+
The output results may be influenced by input quality, limitations of the dataset, and the model's susceptibility to hallucinations. Please interpret the results with caution.
|
172 |
+
|
173 |
+
**Copyright 2023 Alibaba DAMO Academy.**
|
174 |
+
""")
|
175 |
+
|
176 |
#TODO show examples below
|
177 |
|
178 |
with gr.Blocks() as demo:
|
|
|
204 |
label="Temperature",
|
205 |
)
|
206 |
|
207 |
+
audio = gr.Checkbox(interactive=True, value=False, label="Audio")
|
208 |
+
gr.Markdown(Note_markdown)
|
209 |
with gr.Column():
|
210 |
chat_state = gr.State()
|
211 |
img_list = gr.State()
|
212 |
chatbot = gr.Chatbot(label='Video-LLaMA')
|
213 |
text_input = gr.Textbox(label='User', placeholder='Please upload your image/video first', interactive=False)
|
214 |
+
|
215 |
+
|
216 |
+
with gr.Column():
|
217 |
+
gr.Examples(examples=[
|
218 |
+
[f"examples/dog.jpg", "What breed do you think this dog is ?"],
|
219 |
+
[f"examples/jonsnow.jpg", "Who's the man on the right? "],
|
220 |
+
[f"examples/statue_of_liberty.jpg", "Can you tell me about this building? "],
|
221 |
+
], inputs=[image, text_input])
|
222 |
+
|
223 |
+
gr.Examples(examples=[
|
224 |
+
[f"examples/skateboarding_dog.mp4", "What is the dog doing? "],
|
225 |
+
[f"examples/birthday.mp4", "What is the boy doing? "],
|
226 |
+
[f"examples/Iron_Man.mp4", "Is the guy in the video Iron Man? "],
|
227 |
+
], inputs=[video, text_input])
|
228 |
|
|
|
229 |
upload_button.click(upload_imgorvideo, [video, image, text_input, chat_state], [video, image, text_input, upload_button, chat_state, img_list])
|
230 |
|
231 |
text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
|
232 |
gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
|
233 |
)
|
234 |
clear.click(gradio_reset, [chat_state, img_list], [chatbot, video, image, text_input, upload_button, chat_state, img_list], queue=False)
|
235 |
+
|
236 |
demo.launch(share=False, enable_queue=True)
|
237 |
|
238 |
+
# %%
|
eval_configs/video_llama_eval.yaml
CHANGED
@@ -3,7 +3,7 @@ model:
|
|
3 |
model_type: pretrain_vicuna
|
4 |
freeze_vit: True
|
5 |
freeze_qformer: True
|
6 |
-
max_txt_len:
|
7 |
end_sym: "###"
|
8 |
low_resource: False
|
9 |
|
|
|
3 |
model_type: pretrain_vicuna
|
4 |
freeze_vit: True
|
5 |
freeze_qformer: True
|
6 |
+
max_txt_len: 140
|
7 |
end_sym: "###"
|
8 |
low_resource: False
|
9 |
|