Update app.py
Browse files
app.py
CHANGED
|
@@ -16,6 +16,7 @@ import cv2
|
|
| 16 |
from transformers import (
|
| 17 |
Qwen2VLForConditionalGeneration,
|
| 18 |
Qwen2_5_VLForConditionalGeneration,
|
|
|
|
| 19 |
AutoProcessor,
|
| 20 |
TextIteratorStreamer,
|
| 21 |
)
|
|
@@ -55,6 +56,15 @@ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
|
| 55 |
torch_dtype=torch.float16
|
| 56 |
).to(device).eval()
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
def downsample_video(video_path):
|
| 59 |
"""
|
| 60 |
Downsamples the video to evenly spaced frames.
|
|
@@ -95,6 +105,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 95 |
elif model_name == "Nanonets-OCR-s":
|
| 96 |
processor = processor_v
|
| 97 |
model = model_v
|
|
|
|
|
|
|
|
|
|
| 98 |
else:
|
| 99 |
yield "Invalid model selected."
|
| 100 |
return
|
|
@@ -126,6 +139,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 126 |
buffer = ""
|
| 127 |
for new_text in streamer:
|
| 128 |
buffer += new_text
|
|
|
|
| 129 |
time.sleep(0.01)
|
| 130 |
yield buffer
|
| 131 |
|
|
@@ -148,6 +162,9 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
| 148 |
elif model_name == "Nanonets-OCR-s":
|
| 149 |
processor = processor_v
|
| 150 |
model = model_v
|
|
|
|
|
|
|
|
|
|
| 151 |
else:
|
| 152 |
yield "Invalid model selected."
|
| 153 |
return
|
|
@@ -190,6 +207,7 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
| 190 |
buffer = ""
|
| 191 |
for new_text in streamer:
|
| 192 |
buffer += new_text
|
|
|
|
| 193 |
time.sleep(0.01)
|
| 194 |
yield buffer
|
| 195 |
|
|
@@ -200,6 +218,7 @@ image_examples = [
|
|
| 200 |
]
|
| 201 |
|
| 202 |
video_examples = [
|
|
|
|
| 203 |
["Identify the main actions in the cartoon video", "videos/2.mp4"]
|
| 204 |
]
|
| 205 |
|
|
@@ -244,7 +263,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 244 |
with gr.Column():
|
| 245 |
output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
|
| 246 |
model_choice = gr.Radio(
|
| 247 |
-
choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B-Instruct", "RolmOCR"],
|
| 248 |
label="Select Model",
|
| 249 |
value="Nanonets-OCR-s"
|
| 250 |
)
|
|
@@ -253,6 +272,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 253 |
gr.Markdown("> [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
|
| 254 |
gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
|
| 255 |
gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents oprical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
|
|
|
|
| 256 |
|
| 257 |
image_submit.click(
|
| 258 |
fn=generate_image,
|
|
|
|
| 16 |
from transformers import (
|
| 17 |
Qwen2VLForConditionalGeneration,
|
| 18 |
Qwen2_5_VLForConditionalGeneration,
|
| 19 |
+
AutoModelForImageTextToText,
|
| 20 |
AutoProcessor,
|
| 21 |
TextIteratorStreamer,
|
| 22 |
)
|
|
|
|
| 56 |
torch_dtype=torch.float16
|
| 57 |
).to(device).eval()
|
| 58 |
|
| 59 |
+
# Load aya-vision-8b
|
| 60 |
+
MODEL_ID_A = "CohereForAI/aya-vision-8b"
|
| 61 |
+
processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
|
| 62 |
+
model_a = AutoModelForImageTextToText.from_pretrained(
|
| 63 |
+
MODEL_ID_A,
|
| 64 |
+
trust_remote_code=True,
|
| 65 |
+
torch_dtype=torch.float16
|
| 66 |
+
).to(device).eval()
|
| 67 |
+
|
| 68 |
def downsample_video(video_path):
|
| 69 |
"""
|
| 70 |
Downsamples the video to evenly spaced frames.
|
|
|
|
| 105 |
elif model_name == "Nanonets-OCR-s":
|
| 106 |
processor = processor_v
|
| 107 |
model = model_v
|
| 108 |
+
elif model_name == "Aya-Vision":
|
| 109 |
+
processor = processor_a
|
| 110 |
+
model = model_a
|
| 111 |
else:
|
| 112 |
yield "Invalid model selected."
|
| 113 |
return
|
|
|
|
| 139 |
buffer = ""
|
| 140 |
for new_text in streamer:
|
| 141 |
buffer += new_text
|
| 142 |
+
buffer = buffer.replace("<|im_end|>", "")
|
| 143 |
time.sleep(0.01)
|
| 144 |
yield buffer
|
| 145 |
|
|
|
|
| 162 |
elif model_name == "Nanonets-OCR-s":
|
| 163 |
processor = processor_v
|
| 164 |
model = model_v
|
| 165 |
+
elif model_name == "Aya-Vision":
|
| 166 |
+
processor = processor_a
|
| 167 |
+
model = model_a
|
| 168 |
else:
|
| 169 |
yield "Invalid model selected."
|
| 170 |
return
|
|
|
|
| 207 |
buffer = ""
|
| 208 |
for new_text in streamer:
|
| 209 |
buffer += new_text
|
| 210 |
+
buffer = buffer.replace("<|im_end|>", "")
|
| 211 |
time.sleep(0.01)
|
| 212 |
yield buffer
|
| 213 |
|
|
|
|
| 218 |
]
|
| 219 |
|
| 220 |
video_examples = [
|
| 221 |
+
["Explain the Ad in Detail", "videos/1.mp4"],
|
| 222 |
["Identify the main actions in the cartoon video", "videos/2.mp4"]
|
| 223 |
]
|
| 224 |
|
|
|
|
| 263 |
with gr.Column():
|
| 264 |
output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
|
| 265 |
model_choice = gr.Radio(
|
| 266 |
+
choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B-Instruct", "RolmOCR", "Aya-Vision"],
|
| 267 |
label="Select Model",
|
| 268 |
value="Nanonets-OCR-s"
|
| 269 |
)
|
|
|
|
| 272 |
gr.Markdown("> [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
|
| 273 |
gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
|
| 274 |
gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents oprical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
|
| 275 |
+
gr.Markdown("> [Aya-Vision](https://huggingface.co/CohereLabs/aya-vision-8b): cohere labs aya vision 8b is an open weights research release of an 8-billion parameter model with advanced capabilities optimized for a variety of vision-language use cases, including ocr, captioning, visual reasoning, summarization, question answering, code, and more.")
|
| 276 |
|
| 277 |
image_submit.click(
|
| 278 |
fn=generate_image,
|