Spaces: Running on Zero

Commit · 5b5416f
Parent(s): ba25bef

update for zero gpu

Files changed:
- README.md (+1 / -1)
- app.py (+67 / -116)
- requirements.txt (+1 / -1)
README.md
@@ -4,7 +4,7 @@ emoji: 👀
 colorFrom: purple
 colorTo: indigo
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.47.2
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py
@@ -1,15 +1,17 @@
 import colorsys
 import gc
+from copy import deepcopy
 from typing import Optional

 import gradio as gr
 import numpy as np
+import spaces
 import torch
 from gradio.themes import Soft
 from PIL import Image, ImageDraw

 # Prefer local transformers in the workspace
-from transformers import Sam2VideoModel, Sam2VideoProcessor
+from transformers import AutoModel, Sam2VideoProcessor


 def pastel_color_for_object(obj_id: int) -> tuple[int, int, int]:
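The new `spaces` import is what ties the app to Hugging Face ZeroGPU: the process starts without a GPU, and one is attached only while a function decorated with `spaces.GPU` is running, which is why the rest of the diff moves model loading and the default device to CPU. A minimal, hedged sketch of that pattern (the tiny model and the `run_on_gpu` helper are illustrative, not part of the app):

```python
# Minimal ZeroGPU sketch (illustrative names, not the app's code).
# Outside the decorated call there is no GPU; inside it, CUDA is available.
import spaces
import torch

model = torch.nn.Linear(4, 4)  # loaded on CPU at startup

@spaces.GPU(duration=60)  # request a GPU for up to ~60 s per call
def run_on_gpu(x: list[float]) -> list[float]:
    m = model.to("cuda")  # move weights only for the duration of the call
    with torch.inference_mode():
        y = m(torch.tensor(x, device="cuda"))
    return y.cpu().tolist()
```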
@@ -52,10 +54,12 @@ def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], di
                cap.release()
                if fps_val and fps_val > 0:
                    info["fps"] = float(fps_val)
-           except Exception:
+           except Exception as e:
+               print(f"Failed to render video with cv2: {e}")
                pass
        return pil_frames, info
-   except Exception:
+   except Exception as e:
+       print(f"Failed to load video with transformers.video_utils: {e}")
        # Fallback to OpenCV
        try:
            import cv2  # type: ignore
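The loader keeps OpenCV as the fallback when `transformers.video_utils` cannot decode the file; the hunk above only adds logging so the reason for the fallback is visible. For reference, a minimal sketch of such an OpenCV fallback reader (a hypothetical helper, not the app's exact implementation):

```python
# Hedged sketch of an OpenCV frame-loading fallback (hypothetical helper).
import cv2  # type: ignore
from PIL import Image

def load_frames_with_cv2(path: str) -> tuple[list[Image.Image], dict]:
    cap = cv2.VideoCapture(path)
    frames: list[Image.Image] = []
    info: dict = {}
    fps_val = cap.get(cv2.CAP_PROP_FPS)
    while True:
        ok, frame_bgr = cap.read()
        if not ok:
            break
        # OpenCV decodes to BGR; PIL expects RGB
        frames.append(Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)))
    cap.release()
    if fps_val and fps_val > 0:
        info["fps"] = float(fps_val)
    info["num_frames"] = len(frames)
    return frames, info
```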
@@ -115,7 +119,7 @@ def overlay_masks_on_frame(


 def get_device_and_dtype() -> tuple[str, torch.dtype]:
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cpu"
     dtype = torch.bfloat16
     return device, dtype

@@ -127,9 +131,9 @@ class AppState:
     def reset(self):
         self.video_frames: list[Image.Image] = []
         self.inference_session = None
-        self.model: Optional[Sam2VideoModel] = None
+        self.model: Optional[AutoModel] = None
         self.processor: Optional[Sam2VideoProcessor] = None
-        self.device: str = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device: str = "cpu"
         self.dtype: torch.dtype = torch.bfloat16
         self.video_fps: float | None = None
         self.masks_by_frame: dict[int, dict[int, np.ndarray]] = {}
@@ -153,6 +157,9 @@ class AppState:
         self.model_repo_id: str | None = None
         self.session_repo_id: str | None = None

+    def __repr__(self):
+        return f"AppState(video_frames={self.video_frames}, inference_session={self.inference_session is not None}, model={self.model is not None}, processor={self.processor is not None}, device={self.device}, dtype={self.dtype}, video_fps={self.video_fps}, masks_by_frame={self.masks_by_frame}, color_by_obj={self.color_by_obj}, clicks_by_frame_obj={self.clicks_by_frame_obj}, boxes_by_frame_obj={self.boxes_by_frame_obj}, composited_frames={self.composited_frames}, current_frame_idx={self.current_frame_idx}, current_obj_id={self.current_obj_id}, current_label={self.current_label}, current_clear_old={self.current_clear_old}, current_prompt_type={self.current_prompt_type}, pending_box_start={self.pending_box_start}, pending_box_start_frame_idx={self.pending_box_start_frame_idx}, pending_box_start_obj_id={self.pending_box_start_obj_id}, is_switching_model={self.is_switching_model}, model_repo_key={self.model_repo_key}, model_repo_id={self.model_repo_id}, session_repo_id={self.session_repo_id})"
+
     @property
     def num_frames(self) -> int:
         return len(self.video_frames)
@@ -168,29 +175,18 @@ def _model_repo_from_key(key: str) -> str:
     return mapping.get(key, mapping["base_plus"])


-def load_model_if_needed(GLOBAL_STATE: gr.State) -> tuple[Sam2VideoModel, Sam2VideoProcessor, str, torch.dtype]:
+def load_model_if_needed(GLOBAL_STATE: gr.State) -> tuple[AutoModel, Sam2VideoProcessor, str, torch.dtype]:
     desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
     if GLOBAL_STATE.model is not None and GLOBAL_STATE.processor is not None:
         if GLOBAL_STATE.model_repo_id == desired_repo:
             return GLOBAL_STATE.model, GLOBAL_STATE.processor, GLOBAL_STATE.device, GLOBAL_STATE.dtype
     # Different repo requested: dispose current and reload
-    try:
-        del GLOBAL_STATE.model
-    except Exception:
-        pass
-    try:
-        del GLOBAL_STATE.processor
-    except Exception:
-        pass
     GLOBAL_STATE.model = None
     GLOBAL_STATE.processor = None
     print(f"Loading model from {desired_repo}")
     device, dtype = get_device_and_dtype()
     # free up the gpu memory
-
-    gc.collect()
-    print("device", device)
-    model = Sam2VideoModel.from_pretrained(desired_repo)
+    model = AutoModel.from_pretrained(desired_repo)
     processor = Sam2VideoProcessor.from_pretrained(desired_repo)
     model.to(device, dtype=dtype)

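Loading now goes through `AutoModel.from_pretrained` instead of the class-specific `Sam2VideoModel`, so any checkpoint the pinned transformers fork can resolve (including, presumably, the EdgeTAM variants the requirements change points at) works without touching this function. A hedged sketch of the loading step; the repo id below is an assumed example, the app actually resolves it via `_model_repo_from_key`:

```python
# Hedged sketch of the loading pattern above; the checkpoint id is an
# assumed example, and the fork pinned in requirements.txt must be installed.
import torch
from transformers import AutoModel, Sam2VideoProcessor

repo_id = "facebook/sam2.1-hiera-base-plus"  # assumption, not the app's mapping
model = AutoModel.from_pretrained(repo_id)
processor = Sam2VideoProcessor.from_pretrained(repo_id)
model.to("cpu", dtype=torch.bfloat16)  # stays on CPU until a ZeroGPU call needs it
```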
@@ -216,24 +212,11 @@ def ensure_session_for_current_model(GLOBAL_STATE: gr.State) -> None:
     GLOBAL_STATE.clicks_by_frame_obj.clear()
     GLOBAL_STATE.boxes_by_frame_obj.clear()
     GLOBAL_STATE.composited_frames.clear()
-    # Dispose previous session cleanly
-    try:
-        if GLOBAL_STATE.inference_session is not None:
-            GLOBAL_STATE.inference_session.reset_inference_session()
-    except Exception:
-        pass
     GLOBAL_STATE.inference_session = None
-    gc.collect()
-    try:
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-    except Exception:
-        pass
     GLOBAL_STATE.inference_session = processor.init_video_session(
-        video=GLOBAL_STATE.video_frames,
         inference_device=device,
         video_storage_device="cpu",
-
+        dtype=dtype,
     )
     GLOBAL_STATE.session_repo_id = desired_repo

@@ -267,43 +250,21 @@ def init_video_session(GLOBAL_STATE: gr.State, video: str | dict) -> tuple[AppSt
     # Enforce max duration of 8 seconds (trim if longer)
     MAX_SECONDS = 8.0
     trimmed_note = ""
-    fps_in =
-
-
-
-
-
-
-        max_frames_allowed = int(MAX_SECONDS * fps_in)
-        if len(frames) > max_frames_allowed:
-            frames = frames[:max_frames_allowed]
-            trimmed_note = f" (trimmed to {int(MAX_SECONDS)}s = {len(frames)} frames)"
-        if isinstance(info, dict):
-            info["num_frames"] = len(frames)
-    else:
-        # Fallback when FPS unknown: assume ~30 FPS and cap to 240 frames (~8s)
-        max_frames_allowed = 240
-        if len(frames) > max_frames_allowed:
-            frames = frames[:max_frames_allowed]
-            trimmed_note = " (trimmed to 240 frames ~8s @30fps)"
-        if isinstance(info, dict):
-            info["num_frames"] = len(frames)
-
+    fps_in = info.get("fps")
+    max_frames_allowed = int(MAX_SECONDS * fps_in)
+    if len(frames) > max_frames_allowed:
+        frames = frames[:max_frames_allowed]
+        trimmed_note = f" (trimmed to {int(MAX_SECONDS)}s = {len(frames)} frames)"
+    if isinstance(info, dict):
+        info["num_frames"] = len(frames)
     GLOBAL_STATE.video_frames = frames
     # Try to capture original FPS if provided by loader
-    GLOBAL_STATE.video_fps = None
-    if isinstance(info, dict) and info.get("fps"):
-        try:
-            GLOBAL_STATE.video_fps = float(info["fps"]) or None
-        except Exception:
-            GLOBAL_STATE.video_fps = None
-
+    GLOBAL_STATE.video_fps = float(fps_in)
     # Initialize session
     inference_session = processor.init_video_session(
-        video=frames,
         inference_device=device,
         video_storage_device="cpu",
-
+        dtype=dtype,
     )
     GLOBAL_STATE.inference_session = inference_session

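The trimming logic collapses to a single arithmetic step now that the FPS comes straight from the loader's `info` dict; the old fallback branch that capped unknown-FPS videos at 240 frames is removed. As a worked example of the new frame budget (values are illustrative):

```python
# Worked example of the 8-second frame budget above (values are examples).
MAX_SECONDS = 8.0
fps_in = 30.0                                   # FPS as reported by the video loader
max_frames_allowed = int(MAX_SECONDS * fps_in)  # 8 s * 30 fps = 240 frames
frames = list(range(300))                       # pretend the clip has 300 frames
if len(frames) > max_frames_allowed:
    frames = frames[:max_frames_allowed]
print(len(frames))  # -> 240
```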
@@ -414,6 +375,12 @@ def on_image_click(
     processor = state.processor
     model = state.model
     inference_session = state.inference_session
+    original_size = None
+    pixel_values = None
+    if inference_session.processed_frames is None or frame_idx not in inference_session.processed_frames:
+        inputs = processor(images=state.video_frames[frame_idx], device=state.device, return_tensors="pt")
+        original_size = inputs.original_sizes[0]
+        pixel_values = inputs.pixel_values[0]

     if state.current_prompt_type == "Boxes":
         # Two-click box input
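Because the session is now created without the video, frames are preprocessed on demand: a frame the session has not cached yet goes through the processor, and the resulting `pixel_values` (plus `original_size` for prompts) accompany the forward call. A condensed, hedged sketch of that flow (the wrapper function is illustrative; model, processor, and session come from the app state):

```python
# Hedged sketch of the lazy per-frame preprocessing used above.
import torch
from PIL import Image

def forward_single_frame(model, processor, inference_session, frame: Image.Image, frame_idx: int, device: str):
    pixel_values = None
    # Only preprocess frames the session has not already cached.
    if inference_session.processed_frames is None or frame_idx not in inference_session.processed_frames:
        inputs = processor(images=frame, device=device, return_tensors="pt")
        pixel_values = inputs.pixel_values[0]
    with torch.inference_mode():
        return model(inference_session=inference_session, frame=pixel_values, frame_idx=frame_idx)
```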
@@ -445,6 +412,7 @@
             obj_ids=int(obj_id),
             input_boxes=[[[x_min, y_min, x_max, y_max]]],
             clear_old_inputs=True,  # For boxes, always clear old inputs
+            original_size=original_size,
         )

         frame_boxes = state.boxes_by_frame_obj.setdefault(int(frame_idx), {})
@@ -467,6 +435,7 @@
             obj_ids=int(obj_id),
             input_points=[[[[int(x), int(y)]]]],
             input_labels=[[[int(label_int)]]],
+            original_size=original_size,
             clear_old_inputs=bool(clear_old),
         )

@@ -478,12 +447,8 @@
     state.composited_frames.pop(int(frame_idx), None)

     # Forward on that frame
-
-
-    outputs = model(
-        inference_session=inference_session,
-        frame_idx=int(frame_idx),
-    )
+    with torch.inference_mode():
+        outputs = model(inference_session=inference_session, frame=pixel_values, frame_idx=int(frame_idx))

     H = inference_session.video_height
     W = inference_session.video_width
@@ -509,31 +474,37 @@
     return update_frame_display(state, int(frame_idx))


+@spaces.GPU()
 def propagate_masks(GLOBAL_STATE: gr.State):
     if GLOBAL_STATE is None or GLOBAL_STATE.inference_session is None:
-        yield "Load a video first.", gr.update()
-        return
+        # yield GLOBAL_STATE, "Load a video first.", gr.update()
+        return GLOBAL_STATE, "Load a video first.", gr.update()

-    processor = GLOBAL_STATE.processor
-    model = GLOBAL_STATE.model
-    inference_session = GLOBAL_STATE.inference_session
+    processor = deepcopy(GLOBAL_STATE.processor)
+    model = deepcopy(GLOBAL_STATE.model)
+    inference_session = deepcopy(GLOBAL_STATE.inference_session)
+    # set inference device to cuda to use zero gpu
+    inference_session.inference_device = "cuda"
+    inference_session.cache.inference_device = "cuda"
+    model.to("cuda")

     total = max(1, GLOBAL_STATE.num_frames)
     processed = 0

     # Initial status; no slider change yet
-    yield f"Propagating masks: {processed}/{total}", gr.update()
+    yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update()

-    device_type = "cuda" if GLOBAL_STATE.device == "cuda" else "cpu"
     last_frame_idx = 0
-    with torch.inference_mode()
-    for
+    with torch.inference_mode():
+        for frame_idx, frame in enumerate(GLOBAL_STATE.video_frames):
+            pixel_values = None
+            if inference_session.processed_frames is None or frame_idx not in inference_session.processed_frames:
+                pixel_values = processor(images=frame, device="cuda", return_tensors="pt").pixel_values[0]
+            sam2_video_output = model(inference_session=inference_session, frame=pixel_values, frame_idx=frame_idx)
             H = inference_session.video_height
             W = inference_session.video_width
             pred_masks = sam2_video_output.pred_masks.detach().cpu()
             video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
-
-            frame_idx = int(sam2_video_output.frame_idx)
             last_frame_idx = frame_idx
             masks_for_frame: dict[int, np.ndarray] = {}
             obj_ids_order = list(inference_session.obj_ids)
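This hunk is the heart of the ZeroGPU change: the whole propagation loop runs inside `@spaces.GPU()`, and rather than mutating the CPU-resident objects stored in `gr.State`, the handler deep-copies the model, processor, and session and retargets the copies to `"cuda"` for the duration of the call. A condensed, hedged sketch of that shape (mask post-processing and bookkeeping omitted; attribute names follow the diff):

```python
# Condensed, hedged sketch of the ZeroGPU propagation pattern above.
from copy import deepcopy

import spaces
import torch

@spaces.GPU()
def propagate(state):
    # Work on copies so the CPU-side objects kept in gr.State stay untouched.
    model = deepcopy(state.model).to("cuda")
    processor = deepcopy(state.processor)
    session = deepcopy(state.inference_session)
    session.inference_device = "cuda"        # session tensors now live on the GPU
    session.cache.inference_device = "cuda"

    with torch.inference_mode():
        for frame_idx, frame in enumerate(state.video_frames):
            pixel_values = None
            if session.processed_frames is None or frame_idx not in session.processed_frames:
                pixel_values = processor(images=frame, device="cuda", return_tensors="pt").pixel_values[0]
            output = model(inference_session=session, frame=pixel_values, frame_idx=frame_idx)
            yield state, f"Propagating masks: {frame_idx + 1}/{len(state.video_frames)}", output
```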
@@ -546,16 +517,13 @@ def propagate_masks(GLOBAL_STATE: gr.State):

             processed += 1
             # Every 15th frame (or last), move slider to current frame to update preview via slider binding
-            if processed % 15 == 0 or processed == total:
-                yield f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
-
-
+            if processed % 30 == 0 or processed == total:
+                yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
+
+    text = f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects."

     # Final status; ensure slider points to last processed frame
-    yield (
-        f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects.",
-        gr.update(value=last_frame_idx),
-    )
+    yield GLOBAL_STATE, text, gr.update(value=last_frame_idx)


 def reset_session(GLOBAL_STATE: gr.State) -> tuple[AppState, Image.Image, int, int, str]:
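Each `yield` now carries three values, the state object, a status string, and a slider update, which is why the click wiring later in the diff widens to `outputs=[GLOBAL_STATE, propagate_status, frame_slider]`. A minimal, hedged sketch of that Gradio generator pattern (component names are illustrative, not the app's):

```python
# Minimal sketch of a Gradio generator event updating several outputs per yield.
import gradio as gr

def long_job(state):
    total = 5
    for i in range(total):
        # (state, status text, slider update) -- one value per declared output
        yield state, f"Processing {i + 1}/{total}", gr.update(value=i)
    yield state, "Done", gr.update(value=total - 1)

with gr.Blocks() as demo:
    state = gr.State({})
    status = gr.Markdown()
    slider = gr.Slider(0, 10, step=1)
    run = gr.Button("Run")
    run.click(long_job, inputs=[state], outputs=[state, status, slider])
```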
@@ -581,11 +549,6 @@ def reset_session(GLOBAL_STATE: gr.State) -> tuple[AppState, Image.Image, int, i
         pass
     GLOBAL_STATE.inference_session = None
     gc.collect()
-    try:
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-    except Exception:
-        pass
     ensure_session_for_current_model(GLOBAL_STATE)

     # Keep current slider index if possible
@@ -786,29 +749,17 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
         out_path = "/tmp/sam2_playback.mp4"
         # Prefer imageio with PyAV/ffmpeg to respect exact fps
         try:
-            import
+            import cv2  # type: ignore

-
+            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+            writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
+            for fr_bgr in frames_np:
+                writer.write(fr_bgr)
+            writer.release()
             return out_path
-        except Exception:
-
-
-            import imageio.v2 as imageio  # type: ignore
-
-            imageio.mimsave(out_path, [fr[:, :, ::-1] for fr in frames_np], fps=fps)
-            return out_path
-        except Exception:
-            try:
-                import cv2  # type: ignore
-
-                fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-                writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
-                for fr_bgr in frames_np:
-                    writer.write(fr_bgr)
-                writer.release()
-                return out_path
-            except Exception as e:
-                raise gr.Error(f"Failed to render video: {e}")
+        except Exception as e:
+            print(f"Failed to render video with cv2: {e}")
+            raise gr.Error(f"Failed to render video: {e}")

     render_btn.click(_render_video, inputs=[GLOBAL_STATE], outputs=[playback_video])

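The renderer now writes the MP4 with OpenCV directly instead of trying imageio/PyAV first. One detail worth noting: `cv2.VideoWriter` expects BGR, uint8 frames of exactly `(h, w, 3)`, which is why the removed imageio path reversed the channel order (`fr[:, :, ::-1]`) while the new path writes `frames_np` as-is. A standalone, hedged sketch of the writer (frame contents, size, and fps are example values):

```python
# Hedged sketch of writing an MP4 with OpenCV, mirroring the new render path.
import cv2  # type: ignore
import numpy as np

fps, w, h = 30, 640, 360
frames_bgr = [np.zeros((h, w, 3), dtype=np.uint8) for _ in range(fps)]  # 1 s of black frames

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter("/tmp/example.mp4", fourcc, fps, (w, h))
for frame in frames_bgr:
    writer.write(frame)  # VideoWriter expects BGR, uint8, shape (h, w, 3)
writer.release()
```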
@@ -816,7 +767,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
     propagate_btn.click(
         propagate_masks,
         inputs=[GLOBAL_STATE],
-        outputs=[propagate_status, frame_slider],
+        outputs=[GLOBAL_STATE, propagate_status, frame_slider],
     )

     reset_btn.click(
requirements.txt
@@ -1,5 +1,5 @@
 gradio
-git+https://github.com/
+git+https://github.com/yonigozlan/transformers.git@add-edgetam
 torch
 torchvision
 pillow
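Pinning transformers to the `add-edgetam` branch of the `yonigozlan` fork is what makes `Sam2VideoProcessor` (and, presumably, the EdgeTAM checkpoints) importable in app.py. pip resolves this requirements line the same way as the equivalent command `pip install "git+https://github.com/yonigozlan/transformers.git@add-edgetam"`, so the pin can be reproduced locally for testing.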