Spaces:

lixin4ever
/

VideoRefer-VideoLLaMA3

Running on Zero

App Files Files Community

feat: Enable MCP

by multimodalart HF Staff - opened 4 days ago

base: refs/heads/main

←

from: refs/pr/9

Discussion Files changed

+119

-1

Files changed (1) hide show

app.py +119 -1

app.py CHANGED Viewed

@@ -51,6 +51,22 @@ def add_contour(img, mask, color=(1., 1., 1.)):
 @spaces.GPU(duration=120)
 def generate_masks(image, mask_list, mask_raw_list):
     image['image'] = image['background'].convert('RGB')
     # del image['background'], image['composite']
     assert len(image['layers']) == 1, f"Expected 1 layer, got {len(image['layers'])}"
@@ -77,6 +93,23 @@ def generate_masks(image, mask_list, mask_raw_list):
 @spaces.GPU(duration=120)
 def generate_masks_video(image, mask_list_video, mask_raw_list_video):
     image['image'] = image['background'].convert('RGB')
     # del image['background'], image['composite']
     assert len(image['layers']) == 1, f"Expected 1 layer, got {len(image['layers'])}"
@@ -104,6 +137,21 @@ def generate_masks_video(image, mask_list_video, mask_raw_list_video):
 @spaces.GPU(duration=120)
 def describe(image, mode, query, masks):
     # Create an image object from the uploaded image
     # print(image.keys())
@@ -194,6 +242,18 @@ def describe(image, mode, query, masks):
 def load_first_frame(video_path):
     cap = cv2.VideoCapture(video_path)
     ret, frame = cap.read()
     cap.release()
@@ -205,6 +265,25 @@ def load_first_frame(video_path):
 @spaces.GPU(duration=120)
 def describe_video(video_path, mode, query, annotated_frame, masks, mask_list_video):
     # Create a temporary directory to save extracted video frames
     cap = cv2.VideoCapture(video_path)
@@ -312,6 +391,18 @@ def describe_video(video_path, mode, query, annotated_frame, masks, mask_list_vi
 @spaces.GPU(duration=120)
 def apply_sam(image, input_points):
     inputs = sam_processor(image, input_points=input_points, return_tensors="pt").to(device)
     with torch.no_grad():
@@ -328,6 +419,13 @@ def apply_sam(image, input_points):
 def clear_masks():
     return [], [], []
@@ -459,6 +557,16 @@ if __name__ == "__main__":
         def toggle_query_and_generate_button(mode):
             query_visible = mode == "QA"
             caption_visible = mode == "Caption"
             return gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=caption_visible), gr.update(visible=caption_visible), [], "", [], [],[],[]
@@ -468,6 +576,16 @@ if __name__ == "__main__":
         mode.change(toggle_query_and_generate_button, inputs=mode, outputs=[query, generate_mask_btn, clear_masks_btn, submit_btn1, mask_output, output_image, submit_btn, mask_output, description, mask_list, mask_raw_list, mask_list_video, mask_raw_list_video])
         def toggle_query_and_generate_button_video(mode):
             query_visible = mode == "QA"
             caption_visible = mode == "Caption"
             return gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=caption_visible), [], [], [], [], []
@@ -537,4 +655,4 @@ if __name__ == "__main__":
     model, processor, tokenizer = model_init(args_cli.model_path)
-    demo.launch()

 @spaces.GPU(duration=120)
 def generate_masks(image, mask_list, mask_raw_list):
+    """
+    Generates segmentation masks for selected regions in an image using SAM.
+    Args:
+        image (dict): A dictionary containing image data, typically from a Gradio ImageEditor,
+                      with 'background' (PIL Image) and 'layers' (list of PIL Image layers).
+        mask_list (list): A list to accumulate (mask_image, label) tuples for display in a gallery.
+        mask_raw_list (list): A list to accumulate raw NumPy mask arrays.
+    Returns:
+        tuple: A tuple containing:
+               - mask_list (list): Updated list of mask images for display.
+               - image (dict): Updated image dictionary with layers cleared.
+               - mask_list (list): Redundant return of mask_list (for Gradio update).
+               - mask_raw_list (list): Updated list of raw mask arrays.
+    """
     image['image'] = image['background'].convert('RGB')
     # del image['background'], image['composite']
     assert len(image['layers']) == 1, f"Expected 1 layer, got {len(image['layers'])}"
 @spaces.GPU(duration=120)
 def generate_masks_video(image, mask_list_video, mask_raw_list_video):
+    """
+    Generates segmentation masks for selected regions in the first frame of a video using SAM.
+    Args:
+        image (dict): A dictionary containing image data (first frame of video),
+                      typically from a Gradio ImageEditor, with 'background' (PIL Image)
+                      and 'layers' (list of PIL Image layers).
+        mask_list_video (list): A list to accumulate (mask_image, label) tuples for display.
+        mask_raw_list_video (list): A list to accumulate raw NumPy mask arrays for video processing.
+    Returns:
+        tuple: A tuple containing:
+               - mask_list_video (list): Updated list of mask images for display.
+               - image (dict): Updated image dictionary with layers cleared.
+               - mask_list_video (list): Redundant return of mask_list_video (for Gradio update).
+               - mask_raw_list_video (list): Updated list of raw mask arrays.
+    """
     image['image'] = image['background'].convert('RGB')
     # del image['background'], image['composite']
     assert len(image['layers']) == 1, f"Expected 1 layer, got {len(image['layers'])}"
 @spaces.GPU(duration=120)
 def describe(image, mode, query, masks):
+    """
+    Describes an image based on selected regions or answers a question about them.
+    Args:
+        image (dict): A dictionary containing image data, typically from a Gradio ImageEditor,
+                      with 'background' (PIL Image) and 'layers' (list of PIL Image layers).
+        mode (str): The operational mode, either "Caption" (to describe a selected region)
+                    or "QA" (to answer a question about one or more regions).
+        query (str): The question to ask in "QA" mode. Ignored in "Caption" mode.
+        masks (list): A list of raw NumPy mask arrays representing previously generated masks.
+    Yields:
+        tuple: An image with contours and the generated text description/answer,
+               or updates for Gradio components during streaming.
+    """
     # Create an image object from the uploaded image
     # print(image.keys())
 def load_first_frame(video_path):
+    """
+    Loads the first frame of a given video file.
+    Args:
+        video_path (str): The file path to the video.
+    Returns:
+        PIL.Image.Image: The first frame of the video as a PIL Image.
+    Raises:
+        gr.Error: If the video file cannot be read.
+    """
     cap = cv2.VideoCapture(video_path)
     ret, frame = cap.read()
     cap.release()
 @spaces.GPU(duration=120)
 def describe_video(video_path, mode, query, annotated_frame, masks, mask_list_video):
+    """
+    Describes a video based on selected regions in its first frame or answers a question about them.
+    Args:
+        video_path (str): The file path to the video.
+        mode (str): The operational mode, either "Caption" (to describe a selected region)
+                    or "QA" (to answer a question about one or more regions).
+        query (str): The question to ask in "QA" mode. Ignored in "Caption" mode.
+        annotated_frame (dict): A dictionary containing the first frame's image data
+                                from a Gradio ImageEditor, with 'background' (PIL Image)
+                                and 'layers' (list of PIL Image layers).
+        masks (list): A list of raw NumPy mask arrays representing previously generated masks
+                      for objects in the video.
+        mask_list_video (list): A list to accumulate (mask_image, label) tuples for display.
+    Yields:
+        tuple: The annotated first frame, the generated text description/answer,
+               and updated mask lists for Gradio components during streaming.
+    """
     # Create a temporary directory to save extracted video frames
     cap = cv2.VideoCapture(video_path)
 @spaces.GPU(duration=120)
 def apply_sam(image, input_points):
+    """
+    Applies the Segment Anything Model (SAM) to an image based on input points
+    to generate a segmentation mask.
+    Args:
+        image (PIL.Image.Image): The input image.
+        input_points (list): A list of lists, where each inner list contains
+                             [x, y] coordinates representing points used for segmentation.
+    Returns:
+        numpy.ndarray: The selected binary segmentation mask as a NumPy array (H, W).
+    """
     inputs = sam_processor(image, input_points=input_points, return_tensors="pt").to(device)
     with torch.no_grad():
 def clear_masks():
+    """
+    Produces fresh empty lists used to reset the mask state; no existing list is mutated.
+    Returns:
+        tuple: Three empty lists, intended to reset Gradio components
+               displaying masks.
+    """
     return [], [], []
         def toggle_query_and_generate_button(mode):
+            """
+            Toggles the visibility of query-related Gradio components based on the selected mode.
+            Also clears mask states.
+            Args:
+                mode (str): The selected mode ("Caption" or "QA").
+            Returns:
+                tuple: Seven gr.update() visibility updates, followed by reset values (empty lists and an empty string for the description) for the remaining outputs.
+            """
             query_visible = mode == "QA"
             caption_visible = mode == "Caption"
             return gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=caption_visible), gr.update(visible=caption_visible), [], "", [], [],[],[]
         mode.change(toggle_query_and_generate_button, inputs=mode, outputs=[query, generate_mask_btn, clear_masks_btn, submit_btn1, mask_output, output_image, submit_btn, mask_output, description, mask_list, mask_raw_list, mask_list_video, mask_raw_list_video])
         def toggle_query_and_generate_button_video(mode):
+            """
+            Toggles the visibility of query-related Gradio components for video mode
+            based on the selected mode. Also clears mask states.
+            Args:
+                mode (str): The selected mode ("Caption" or "QA").
+            Returns:
+                tuple: A series of gr.update() calls and empty lists to update Gradio components.
+            """
             query_visible = mode == "QA"
             caption_visible = mode == "Caption"
             return gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=caption_visible), [], [], [], [], []
     model, processor, tokenizer = model_init(args_cli.model_path)
+    demo.launch(mcp_server=True)