set device in inference_state manually
- .gitignore +1 -0
- app.py +51 -43
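This commit keeps the SAM2 predictor on the CPU while the user is placing click prompts (preprocess_video_in, segment_with_points) and only moves it to CUDA for mask propagation (propagate_to_all). SAM2's video predictor records a device in the inference state and allocates its per-frame tensors there, so the state's "device" entry has to be updated by hand after every predictor.to(...) call, or propagation can hit device-mismatch errors. A minimal sketch of the pattern, assuming a SAM2-style predictor that exposes a .device property (the move_predictor helper is hypothetical, not part of app.py):

    import torch

    def move_predictor(predictor, inference_state, device):
        """Move the model and keep the tracking state's device in sync."""
        predictor.to(device)
        # app.py uses the predictor's .device property; for a plain nn.Module
        # the equivalent is next(predictor.parameters()).device
        inference_state["device"] = predictor.device

    # interactive prompting stays on CPU to free GPU memory between clicks:
    #   move_predictor(predictor, inference_state, "cpu")
    # propagation runs on the GPU:
    #   move_predictor(predictor, inference_state, "cuda")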
.gitignore
CHANGED
@@ -1,2 +1,3 @@
 *.egg-info/
 __pycache__/
+*.DS_Store
app.py
CHANGED
@@ -174,6 +174,8 @@ def preprocess_video_in(
     input_labels,
     inference_state,
 ):
+    predictor.to("cpu")
+    inference_state["device"] = predictor.device
     if video_path is None:
         return (
             gr.update(open=True),  # video_in_drawer
@@ -255,6 +257,8 @@ def segment_with_points(
     inference_state,
     evt: gr.SelectData,
 ):
+    predictor.to("cpu")
+    inference_state["device"] = predictor.device
     input_points.append(evt.index)
     print(f"TRACKING INPUT POINT: {input_points}")
 
@@ -336,55 +340,59 @@ def propagate_to_all(
     input_points,
     inference_state,
 ):
-    # torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
     if torch.cuda.get_device_properties(0).major >= 8:
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cudnn.allow_tf32 = True
-
-    if len(input_points) == 0 or video_in is None or inference_state is None:
-        return None
-    # run propagation throughout the video and collect the results in a dict
-    video_segments = (
-        {}
-    )  # video_segments contains the per-frame segmentation results
-    print("starting propagate_in_video")
-    for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
-        inference_state
-    ):
-        video_segments[out_frame_idx] = {
-            out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
-            for i, out_obj_id in enumerate(out_obj_ids)
-        }
-
-    # obtain the segmentation results every few frames
-    vis_frame_stride = 1
-
-    output_frames = []
-    for out_frame_idx in range(0, len(video_segments), vis_frame_stride):
-        transparent_background = Image.fromarray(all_frames[out_frame_idx]).convert(
-            "RGBA"
+    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+        predictor.to("cuda")
+        inference_state["device"] = predictor.device
+
+        if len(input_points) == 0 or video_in is None or inference_state is None:
+            return None
+        # run propagation throughout the video and collect the results in a dict
+        video_segments = (
+            {}
+        )  # video_segments contains the per-frame segmentation results
+        print("starting propagate_in_video")
+        for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
+            inference_state
+        ):
+            video_segments[out_frame_idx] = {
+                out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                for i, out_obj_id in enumerate(out_obj_ids)
+            }
+
+        # obtain the segmentation results every few frames
+        vis_frame_stride = 1
+
+        output_frames = []
+        for out_frame_idx in range(0, len(video_segments), vis_frame_stride):
+            transparent_background = Image.fromarray(all_frames[out_frame_idx]).convert(
+                "RGBA"
+            )
+            out_mask = video_segments[out_frame_idx][OBJ_ID]
+            mask_image = show_mask(out_mask)
+            output_frame = Image.alpha_composite(transparent_background, mask_image)
+            output_frame = np.array(output_frame)
+            output_frames.append(output_frame)
+
+        torch.cuda.empty_cache()
+
+        # Create a video clip from the image sequence
+        original_fps = get_video_fps(video_in)
+        fps = original_fps  # Frames per second
+        clip = ImageSequenceClip(output_frames, fps=fps)
+        # Write the result to a file
+        unique_id = datetime.now().strftime("%Y%m%d%H%M%S")
+        final_vid_output_path = f"output_video_{unique_id}.mp4"
+        final_vid_output_path = os.path.join(
+            tempfile.gettempdir(), final_vid_output_path
         )
-        out_mask = video_segments[out_frame_idx][OBJ_ID]
-        mask_image = show_mask(out_mask)
-        output_frame = Image.alpha_composite(transparent_background, mask_image)
-        output_frame = np.array(output_frame)
-        output_frames.append(output_frame)
-
-    torch.cuda.empty_cache()
-
-    # Create a video clip from the image sequence
-    original_fps = get_video_fps(video_in)
-    fps = original_fps  # Frames per second
-    clip = ImageSequenceClip(output_frames, fps=fps)
-    # Write the result to a file
-    unique_id = datetime.now().strftime("%Y%m%d%H%M%S")
-    final_vid_output_path = f"output_video_{unique_id}.mp4"
-    final_vid_output_path = os.path.join(tempfile.gettempdir(), final_vid_output_path)
 
-    # Write the result to a file
-    clip.write_videofile(final_vid_output_path, codec="libx264")
+        # Write the result to a file
+        clip.write_videofile(final_vid_output_path, codec="libx264")
 
-    return gr.update(value=final_vid_output_path)
+        return gr.update(value=final_vid_output_path)
 
 
 def update_ui():