Spaces:

prithivMLmods
/

Multimodal-OCR

Running on Zero

App Files Files Community

prithivMLmods committed 8 days ago

Commit

d2b791d

verified ·

1 Parent(s): 3c2995e

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -109

app.py CHANGED Viewed

@@ -11,14 +11,13 @@ from transformers import (
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
-    AutoModelForImageTextToText,
 )
 from transformers import Qwen2_5_VLForConditionalGeneration
 # ---------------------------
 # Helper Functions
 # ---------------------------
-def progress_bar_html(label: str, primary_color: str = "#FF0000", secondary_color: str = "#FF4500") -> str:
     """
     Returns an HTML snippet for a thin animated progress bar with a label.
     Colors can be customized; default colors are used for Qwen2VL/Aya‑Vision.
@@ -65,7 +64,7 @@ def downsample_video(video_path):
 # Model and Processor Setup
 # Qwen2VL OCR (default branch)
-QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct" #[or] prithivMLmods/Qwen2-VL-OCR2-2B-Instruct
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     QV_MODEL_ID,
@@ -73,13 +72,6 @@ qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-# Aya-Vision branch (for @aya-vision and @video-infer)
-AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
-aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
-aya_model = AutoModelForImageTextToText.from_pretrained(
-    AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
-)
 # RolmOCR branch (@RolmOCR)
 ROLMOCR_MODEL_ID = "reducto/RolmOCR"
 rolmocr_processor = AutoProcessor.from_pretrained(ROLMOCR_MODEL_ID, trust_remote_code=True)
@@ -95,93 +87,6 @@ def model_inference(input_dict, history):
     text = input_dict["text"].strip()
     files = input_dict.get("files", [])
-    # ---------------------------
-    # Aya-Vision Video Inference (@video-infer)
-    # ---------------------------
-    if text.lower().startswith("@video-infer"):
-        prompt = text[len("@video-infer"):].strip()
-        if not files:
-            yield "Error: Please provide a video for the @video-infer feature."
-            return
-        video_path = files[0]
-        frames = downsample_video(video_path)
-        if not frames:
-            yield "Error: Could not extract frames from the video."
-            return
-        # Build the message with the text prompt followed by each frame (with timestamp label).
-        content_list = [{"type": "text", "text": prompt}]
-        for frame, timestamp in frames:
-            content_list.append({"type": "text", "text": f"Frame {timestamp}:"})
-            content_list.append({"type": "image", "image": frame})
-        messages = [{"role": "user", "content": content_list}]
-        inputs = aya_processor.apply_chat_template(
-            messages,
-            padding=True,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt"
-        ).to(aya_model.device)
-        streamer = TextIteratorStreamer(aya_processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = dict(
-            inputs,
-            streamer=streamer,
-            max_new_tokens=1024,
-            do_sample=True,
-            temperature=0.3
-        )
-        thread = Thread(target=aya_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing video with Aya-Vision-8b")
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-        return
-    # Aya-Vision Image Inference (@aya-vision)
-    if text.lower().startswith("@aya-vision"):
-        text_prompt = text[len("@aya-vision"):].strip()
-        if not files:
-            yield "Error: Please provide an image for the @aya-vision feature."
-            return
-        image = load_image(files[0])
-        yield progress_bar_html("Processing with Aya-Vision-8b")
-        messages = [{
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": text_prompt},
-            ],
-        }]
-        inputs = aya_processor.apply_chat_template(
-            messages,
-            padding=True,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt"
-        ).to(aya_model.device)
-        streamer = TextIteratorStreamer(aya_processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = dict(
-            inputs,
-            streamer=streamer,
-            max_new_tokens=1024,
-            do_sample=True,
-            temperature=0.3
-        )
-        thread = Thread(target=aya_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-        return
     # RolmOCR Inference (@RolmOCR)
     if text.lower().startswith("@rolmocr"):
         # Remove the tag from the query.
@@ -239,14 +144,14 @@ def model_inference(input_dict, history):
         thread.start()
         buffer = ""
         # Use a different color scheme for RolmOCR (purple-themed).
-        yield progress_bar_html("Processing with Qwen2.5VL (RolmOCR)", primary_color="#4B0082", secondary_color="#9370DB")
         for new_text in streamer:
             buffer += new_text
             buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
             yield buffer
         return
     # Default Inference: Qwen2VL OCR
     # Process files: support multiple images.
     if len(files) > 1:
@@ -294,26 +199,18 @@ examples = [
     [{"text": "@RolmOCR OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
     [{"text": "@RolmOCR OCR the Image", "files": ["rolm/3.jpeg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
-    [{"text": "@aya-vision Summarize the letter", "files": ["examples/1.png"]}],
-    [{"text": "@aya-vision Extract JSON from the image", "files": ["example_images/document.jpg"]}],
-    [{"text": "@video-infer Explain what is happening in this video briefly by understanding", "files": ["examples/oreo.mp4"]}],
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
-    [{"text": "@aya-vision Describe the photo", "files": ["examples/3.png"]}],
-    [{"text": "@aya-vision Summarize the full image in detail", "files": ["examples/2.jpg"]}],
-    [{"text": "@aya-vision Describe this image.", "files": ["example_images/campeones.jpg"]}],
-    [{"text": "@aya-vision What is this UI about?", "files": ["example_images/s2w_example.png"]}],
-    [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
-    description="# **Multimodal OCR `@RolmOCR, @aya-vision for image, @video-infer for video`**",
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
         file_types=["image", "video"],
         file_count="multiple",
-        placeholder="Use tag @RolmOCR @aya-vision for Image, @video-infer for video, or leave blank for default Qwen2VL OCR"
     ),
     stop_btn="Stop Generation",
     multimodal=True,

     Qwen2VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers import Qwen2_5_VLForConditionalGeneration
 # ---------------------------
 # Helper Functions
 # ---------------------------
+def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
     """
     Returns an HTML snippet for a thin animated progress bar with a label.
     Colors can be customized; default colors are used for Qwen2VL/Aya‑Vision.
 # Model and Processor Setup
 # Qwen2VL OCR (default branch)
+QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"  # [or] prithivMLmods/Qwen2-VL-OCR2-2B-Instruct
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     QV_MODEL_ID,
     torch_dtype=torch.float16
 ).to("cuda").eval()
 # RolmOCR branch (@RolmOCR)
 ROLMOCR_MODEL_ID = "reducto/RolmOCR"
 rolmocr_processor = AutoProcessor.from_pretrained(ROLMOCR_MODEL_ID, trust_remote_code=True)
     text = input_dict["text"].strip()
     files = input_dict.get("files", [])
     # RolmOCR Inference (@RolmOCR)
     if text.lower().startswith("@rolmocr"):
         # Remove the tag from the query.
         thread.start()
         buffer = ""
         # Use a different color scheme for RolmOCR (purple-themed).
+        yield progress_bar_html("Processing with Qwen2.5VL (RolmOCR)")
         for new_text in streamer:
             buffer += new_text
             buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
             yield buffer
         return
     # Default Inference: Qwen2VL OCR
     # Process files: support multiple images.
     if len(files) > 1:
     [{"text": "@RolmOCR OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
     [{"text": "@RolmOCR OCR the Image", "files": ["rolm/3.jpeg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
+    description="# **Multimodal OCR `@RolmOCR` and Default Qwen2VL OCR**",
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
         file_types=["image", "video"],
         file_count="multiple",
+        placeholder="Use tag @RolmOCR for RolmOCR, or leave blank for default Qwen2VL OCR"
     ),
     stop_btn="Stop Generation",
     multimodal=True,