Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -13,7 +13,6 @@ from transformers import (
     TextIteratorStreamer,
 )
 from transformers import Qwen2_5_VLForConditionalGeneration
-from pdf2image import convert_from_path
 
 # Helper Functions
 def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
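For context, pdf2image.convert_from_path (dropped here, together with the PDF branch in the next hunk) renders each page of a PDF to a PIL image via poppler. A minimal sketch, with an illustrative path:

from pdf2image import convert_from_path

# Requires poppler installed on the system; each page becomes a PIL.Image.
pages = convert_from_path("docs/sample.pdf")  # illustrative path
for page_num, img in enumerate(pages, start=1):
    print(f"Page {page_num}: {img.size}")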
@@ -79,25 +78,19 @@ docscopeocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
 # Main Inference Function
 @spaces.GPU
-def model_inference(text, files, history, use_docscopeocr):
+def model_inference(message, history, use_docscopeocr):
+    text = message["text"].strip()
+    files = message.get("files", [])
+
     if not text and not files:
-        yield "Error: Please input a text query or provide files (images, videos, or PDFs)."
+        yield "Error: Please input a text query or provide files (images or videos)."
         return
 
-    # Process files: images, videos, and PDFs
+    # Process files: images and videos only
     image_list = []
-    for idx, file in enumerate(files):
-        if file.name.lower().endswith(".pdf"):
-            try:
-                pdf_images = convert_from_path(file.name)
-                for page_num, img in enumerate(pdf_images, start=1):
-                    label = f"PDF {idx+1} Page {page_num}:"
-                    image_list.append((label, img))
-            except Exception as e:
-                yield f"Error converting PDF: {str(e)}"
-                return
-        elif file.name.lower().endswith((".mp4", ".avi", ".mov")):
-            frames = downsample_video(file.name)
+    for idx, file in enumerate(files):
+        if file.lower().endswith((".mp4", ".avi", ".mov")):
+            frames = downsample_video(file)
             if not frames:
                 yield "Error: Could not extract frames from the video."
                 return
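The downsample_video helper called in the new branch is defined elsewhere in app.py and is outside this diff. A hypothetical sketch of such a helper, assuming OpenCV and PIL (the frame budget is illustrative):

import cv2
from PIL import Image

def downsample_video(video_path: str, num_frames: int = 10):
    # Hypothetical implementation; the real helper in app.py may differ.
    # Samples up to num_frames evenly spaced frames as RGB PIL images.
    vidcap = cv2.VideoCapture(video_path)
    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total <= 0:
        vidcap.release()
        return []
    frames = []
    step = max(total // num_frames, 1)
    for i in range(0, total, step):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ok, frame = vidcap.read()
        if not ok:
            break
        # OpenCV reads BGR; convert to RGB before wrapping in PIL.
        frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        if len(frames) >= num_frames:
            break
    vidcap.release()
    return frames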
@@ -106,7 +99,7 @@ def model_inference(text, files, history, use_docscopeocr):
                 image_list.append((label, frame))
         else:
             try:
-                img = load_image(file.name)
+                img = load_image(file)
                 label = f"Image {idx+1}:"
                 image_list.append((label, img))
             except Exception as e:
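load_image here is presumably the transformers helper (transformers.image_utils.load_image), which accepts a local path, a URL, or a PIL image; gr.MultimodalTextbox delivers uploads as plain path strings, so they can be passed straight through. Illustrative inputs:

from transformers.image_utils import load_image

img = load_image("rolm/1.jpeg")                      # local path, as in the examples
img = load_image("https://example.com/receipt.png")  # URLs are accepted as well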
@@ -153,42 +146,28 @@ def model_inference(text, files, history, use_docscopeocr):
         yield buffer
 
 # Gradio Interface
-def chat_interface(text, files, use_docscopeocr, history):
-    if text is None and files is None:
-        return "Error: Please input a text query or provide files."
-    return model_inference(text, files, history, use_docscopeocr)
-
 examples = [
-    {"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]},
-    {"text": "Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]},
-    {"text": "OCR the Image", "files": ["rolm/3.jpeg"]},
-    {"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]},
+    [{"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
+    [{"text": "Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]}],
+    [{"text": "OCR the Image", "files": ["rolm/3.jpeg"]}],
+    [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
 ]
 
-    … (16 deleted lines of the old gr.Blocks layout; not captured in this page view) …
-    def generate(history, text, files, use_docscopeocr):
-        if not history:
-            history = []
-        for response in model_inference(text, files, history, use_docscopeocr):
-            history.append({"role": "assistant", "content": response})
-            yield history
-
-    submit_btn.click(submit, [text_input, file_input, use_docscopeocr, chat], [chat, submit_btn, stop_btn])
-    submit_btn.click(generate, [chat, text_input, file_input, use_docscopeocr], chat)
+demo = gr.ChatInterface(
+    fn=model_inference,
+    description="# **DocScope OCR `VL/OCR`**",
+    examples=examples,
+    textbox=gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "video"],
+        file_count="multiple",
+        placeholder="Input your query and optionally upload image(s) or video(s). Select the model using the checkbox."
+    ),
+    stop_btn="Stop Generation",
+    multimodal=True,
+    cache_examples=False,
+    theme="bethecloud/storj_theme",
+    additional_inputs=[gr.Checkbox(label="Use DocScopeOCR", value=True, info="Check to use DocScopeOCR, uncheck to use Qwen2VL OCR")],
+)
 
 demo.launch(debug=True, ssr_mode=False)
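The gr.ChatInterface above takes over the wiring the deleted Blocks code did by hand: with multimodal=True, the MultimodalTextbox value arrives at fn as a dict ({"text": ..., "files": [...]}), history is managed by the component, and yielding partial strings streams the reply. A self-contained sketch of the same pattern with a stub fn (the echo logic is a placeholder, not the real model):

import gradio as gr

def stub_inference(message, history, use_docscopeocr):
    # message is a dict: {"text": str, "files": [path, ...]} -- the same shape
    # model_inference unpacks above. The reply below is a placeholder.
    text = message["text"].strip()
    files = message.get("files", [])
    model = "DocScopeOCR" if use_docscopeocr else "Qwen2VL OCR"
    reply = f"[{model}] received {len(files)} file(s): {text}"
    for i in range(1, len(reply) + 1):
        yield reply[:i]  # yielding growing prefixes streams into the chat

demo = gr.ChatInterface(
    fn=stub_inference,
    multimodal=True,
    textbox=gr.MultimodalTextbox(file_types=["image", "video"], file_count="multiple"),
    additional_inputs=[gr.Checkbox(label="Use DocScopeOCR", value=True)],
)

if __name__ == "__main__":
    demo.launch()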