prithivMLmods committed · verified
Commit 221d2b6 · Parent: 2512ee8

Update app.py

Files changed (1):
  1. app.py: +46 -11
app.py CHANGED
@@ -4,6 +4,8 @@ from threading import Thread
 import time
 import torch
 import spaces
+import cv2
+import numpy as np
 from PIL import Image
 from transformers import (
     Qwen2VLForConditionalGeneration,
@@ -33,6 +35,30 @@ def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_colo
     </style>
     '''
 
+def downsample_video(video_path):
+    """
+    Downsamples a video file by extracting 10 evenly spaced frames.
+    Returns a list of tuples (PIL.Image, timestamp).
+    """
+    vidcap = cv2.VideoCapture(video_path)
+    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
+    frames = []
+    if total_frames <= 0 or fps <= 0:
+        vidcap.release()
+        return frames
+    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+    for i in frame_indices:
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        success, image = vidcap.read()
+        if success:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(image)
+            timestamp = round(i / fps, 2)
+            frames.append((pil_image, timestamp))
+    vidcap.release()
+    return frames
+
 # Model and Processor Setup
 QV_MODEL_ID = "prithivMLmods/Qwen2-VL-Ocrtest-2B-Instruct"
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
@@ -57,19 +83,28 @@ def model_inference(message, history, use_docscopeocr):
     files = message.get("files", [])
 
     if not text and not files:
-        yield "Error: Please input a text query or provide image files."
+        yield "Error: Please input a text query or provide image or video files."
         return
 
-    # Process files: images only
+    # Process files: images and videos
     image_list = []
     for idx, file in enumerate(files):
-        try:
-            img = load_image(file)
-            label = f"Image {idx+1}:"
-            image_list.append((label, img))
-        except Exception as e:
-            yield f"Error loading image: {str(e)}"
-            return
+        if file.lower().endswith((".mp4", ".avi", ".mov")):
+            frames = downsample_video(file)
+            if not frames:
+                yield "Error: Could not extract frames from the video."
+                return
+            for frame, timestamp in frames:
+                label = f"Video {idx+1} Frame {timestamp}:"
+                image_list.append((label, frame))
+        else:
+            try:
+                img = load_image(file)
+                label = f"Image {idx+1}:"
+                image_list.append((label, img))
+            except Exception as e:
+                yield f"Error loading image: {str(e)}"
+                return
 
     # Build content list
     content = [{"type": "text", "text": text}]
@@ -123,9 +158,9 @@ demo = gr.ChatInterface(
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
-        file_types=["image"],
+        file_types=["image", "video"],
         file_count="multiple",
-        placeholder="Input your query and optionally upload image(s). Select the model using the checkbox."
+        placeholder="Input your query and optionally upload image(s) or video(s). Select the model using the checkbox."
     ),
     stop_btn="Stop Generation",
     multimodal=True,
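
As a quick sanity check of the new helper, here is a self-contained sketch: it writes a tiny synthetic clip with OpenCV, then runs downsample_video (copied verbatim from this commit) over it. The scratch filename and the test harness are illustrative assumptions, not part of the commit, and writing .mp4 assumes an OpenCV build with mp4v/FFmpeg support.

import cv2
import numpy as np
from PIL import Image

# Copied from this commit.
def downsample_video(video_path):
    """
    Downsamples a video file by extracting 10 evenly spaced frames.
    Returns a list of tuples (PIL.Image, timestamp).
    """
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    frames = []
    if total_frames <= 0 or fps <= 0:
        vidcap.release()
        return frames
    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))
    vidcap.release()
    return frames

if __name__ == "__main__":
    # Hypothetical test clip: 90 flat gray frames at 30 fps (3 s), 64x64.
    path = "sample.mp4"  # assumed scratch path, not from the commit
    writer = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"mp4v"), 30.0, (64, 64))
    for i in range(90):
        writer.write(np.full((64, 64, 3), i * 2, dtype=np.uint8))
    writer.release()

    for img, ts in downsample_video(path):
        print(f"{ts:5.2f}s  {img.size}")  # expect 10 frames, roughly 0.33 s apart

Seeking with CAP_PROP_POS_FRAMES before each read keeps the loop O(10) reads instead of decoding every frame, at the cost of codecs where seeking is approximate.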
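
The hunk stops at the "# Build content list" context line, so the downstream assembly is outside this diff. As a hedged sketch only: the (label, image) pairs in image_list would typically be interleaved into the user message in the standard Qwen2-VL chat-template format, roughly like the helper below (the function name and exact shape are assumptions, not the app's confirmed code).

def build_user_message(text, image_list):
    """Sketch: interleave the text query with labeled frames/images."""
    content = [{"type": "text", "text": text}]
    for label, img in image_list:
        content.append({"type": "text", "text": label})   # e.g. "Video 1 Frame 0.5:"
        content.append({"type": "image", "image": img})   # PIL.Image for the processor
    return [{"role": "user", "content": content}]

With qwen_processor from the file above, the prompt string would then come from
qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True).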
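
The UI change is confined to the MultimodalTextbox: videos are now accepted alongside images. A minimal standalone sketch of just that configuration, with a placeholder echo handler standing in for the app's real model_inference:

import gradio as gr

def echo(message, history):
    # Placeholder handler: report what the textbox delivered.
    files = message.get("files", [])
    return f"Got text {message['text']!r} and {len(files)} file(s)."

demo = gr.ChatInterface(
    fn=echo,
    multimodal=True,
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", "video"],  # the change made in this commit
        file_count="multiple",
    ),
)

if __name__ == "__main__":
    demo.launch()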