Spaces:

sitammeur
/

VidiQA

Running on Zero

App Files Files Community

sitammeur committed on Aug 19, 2024

Commit

bfaf6f1

verified ·

1 Parent(s): 2faf7d0

Upload 7 files

Browse files

Files changed (8) hide show

.gitattributes +2 -0
app.py +48 -0
src/__init__.py +0 -0
src/model.py +62 -0
src/utils.py +50 -0
videos/sample_video_1.mp4 +3 -0
videos/sample_video_2.mp4 +0 -0
videos/sample_video_3.mp4 +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+videos/sample_video_1.mp4 filter=lfs diff=lfs merge=lfs -text
+videos/sample_video_3.mp4 filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# Importing the requirements
+# import warnings
+# warnings.filterwarnings("ignore")
+import gradio as gr
+from src.model import describe_video
+# Video and text inputs for the interface
+video = gr.Video(type="file", label="Video")
+query = gr.Textbox(label="Query", placeholder="Type your query here")
+# Output for the interface
+response = gr.Textbox(label="Response", show_label=True, show_copy_button=True)
+# Examples for the interface
+examples = [
+    [
+        "./videos/sample_video_1.mp4",
+        "Here are some frames of a video. Describe this video in detail",
+    ],
+    [
+        "./videos/sample_video_2.mp4",
+        "Which are the animals in this video, and how many are there?",
+    ],
+    ["./videos/sample_video_3.mp4", "What is happening in this video?"],
+]
+# Title, description, and article for the interface
+title = "Video Understanding & Question Answering"
+description = "This Gradio demo uses the MiniCPM-V-2_6 model for video understanding tasks. Upload a video and type a question to get a detailed description or specific information from the video."
+article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2407.03320' target='_blank'>InternLM-XComposer-2.5: A Versatile Large Vision Language Model Supporting Long-Contextual Input and Output</a> | <a href='https://huggingface.co/internlm/internlm-xcomposer2d5-7b' target='_blank'>Model Page</a></p>"
+# Launch the interface
+interface = gr.Interface(
+    fn=describe_video,
+    inputs=[video, query],
+    outputs=response,
+    examples=examples,
+    title=title,
+    description=description,
+    article=article,
+    theme="Soft",
+    allow_flagging="never",
+)
+interface.launch(debug=False)

src/__init__.py ADDED Viewed

File without changes

src/model.py ADDED Viewed

	@@ -0,0 +1,62 @@

+# Importing the requirements
+import torch
+from transformers import AutoModel, AutoTokenizer
+import spaces
+from src.utils import encode_video
+# Device for the model
+device = "cuda"
+# Load the model and tokenizer
+model = AutoModel.from_pretrained(
+    "openbmb/MiniCPM-V-2_6",
+    trust_remote_code=True,
+    attn_implementation="sdpa",
+    torch_dtype=torch.bfloat16,
+)
+model = model.to(device=device)
+tokenizer = AutoTokenizer.from_pretrained(
+    "openbmb/MiniCPM-V-2_6", trust_remote_code=True
+)
+model.eval()
+@spaces.GPU()
+def describe_video(video, question):
+    """
+    Describes a video by generating an answer to a given question.
+    Args:
+        - video (str): The path to the video file.
+        - question (str): The question to be answered about the video.
+    Returns:
+        str: The generated answer to the question.
+    """
+    # Encode the video frames
+    frames = encode_video(video)
+    # Message format for the model
+    msgs = [{"role": "user", "content": frames + [question]}]
+    # Set decode params for video
+    params = {
+        "use_image_id": False,
+        "max_slice_nums": 1,  # Use 1 if CUDA OOM and video resolution > 448*448
+    }
+    # Generate the answer
+    answer = model.chat(
+        image=None,
+        msgs=msgs,
+        tokenizer=tokenizer,
+        sampling=True,
+        temperature=0.7,
+        stream=True,
+        system_prompt="You are an AI assistant specialized in visual content analysis. Given a video and a related question, analyze the video thoroughly and provide a precise and informative answer based on the visible content. Ensure your response is clear, accurate, and directly addresses the question.",
+        **params
+    )
+    # Return the answer
+    return answer

src/utils.py ADDED Viewed

	@@ -0,0 +1,50 @@

+# Importing the requirements
+from PIL import Image
+from decord import VideoReader, cpu
+# Maximum number of frames to use
+MAX_NUM_FRAMES = 64  # If CUDA OOM, set a smaller number
+def encode_video(video_path):
+    """
+    Encodes a video file into a list of frames.
+    Args:
+        video_path (str): The path to the video file.
+    Returns:
+        list: A list of frames, where each frame is represented as an Image object.
+    """
+    def uniform_sample(l, n):
+        """
+        Uniformly samples elements from a list.
+        Args:
+            - l (list): The input list.
+            - n (int): The number of elements to sample.
+        Returns:
+            list: A list of sampled elements.
+        """
+        gap = len(l) / n
+        idxs = [int(i * gap + gap / 2) for i in range(n)]
+        return [l[i] for i in idxs]
+    # Read the video file and sample frames
+    vr = VideoReader(video_path, ctx=cpu(0))
+    sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+    frame_idx = [i for i in range(0, len(vr), sample_fps)]
+    # Uniformly sample frames if the number of frames is too large
+    if len(frame_idx) > MAX_NUM_FRAMES:
+        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+    # Extract frames from the video
+    frames = vr.get_batch(frame_idx).asnumpy()
+    frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+    # Return video frames
+    return frames

videos/sample_video_1.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b277543103dc6b706cdc2b5007085e8eef0c6a9bdc39633e2af31828d7bd98e4
+size 2511799

videos/sample_video_2.mp4 ADDED Viewed

Binary file (826 kB). View file

videos/sample_video_3.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e242b33923dd63ffb2fda6d6853f7ec8ad17207e6221b5467a540159fa1e5c06
+size 2104032