prithivMLmods committed on
Commit 428b15d · verified · 1 Parent(s): 1c5b159

Update app.py

Files changed (1)
  1. app.py +30 -51
app.py CHANGED
@@ -13,7 +13,6 @@ from transformers import (
     TextIteratorStreamer,
 )
 from transformers import Qwen2_5_VLForConditionalGeneration
-from pdf2image import convert_from_path
 
 # Helper Functions
 def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
@@ -79,25 +78,19 @@ docscopeocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
 # Main Inference Function
 @spaces.GPU
-def model_inference(text, files, history, use_docscopeocr):
+def model_inference(message, history, use_docscopeocr):
+    text = message["text"].strip()
+    files = message.get("files", [])
+
     if not text and not files:
-        yield "Error: Please input a text query or provide files (images, videos, PDFs)."
+        yield "Error: Please input a text query or provide files (images or videos)."
         return
 
-    # Process files: images, videos, PDFs
+    # Process files: images and videos only
     image_list = []
-    for idx, file in enumerate(files or []):
-        if file.name.lower().endswith(".pdf"):
-            try:
-                pdf_images = convert_from_path(file.name)
-                for page_num, img in enumerate(pdf_images, start=1):
-                    label = f"PDF {idx+1} Page {page_num}:"
-                    image_list.append((label, img))
-            except Exception as e:
-                yield f"Error converting PDF: {str(e)}"
-                return
-        elif file.name.lower().endswith((".mp4", ".avi", ".mov")):
-            frames = downsample_video(file.name)
+    for idx, file in enumerate(files):
+        if file.lower().endswith((".mp4", ".avi", ".mov")):
+            frames = downsample_video(file)
             if not frames:
                 yield "Error: Could not extract frames from the video."
                 return
@@ -106,7 +99,7 @@ def model_inference(text, files, history, use_docscopeocr):
             image_list.append((label, frame))
         else:
             try:
-                img = load_image(file.name)
+                img = load_image(file)
                 label = f"Image {idx+1}:"
                 image_list.append((label, img))
             except Exception as e:
@@ -153,42 +146,28 @@ def model_inference(text, files, history, use_docscopeocr):
     yield buffer
 
 # Gradio Interface
-def chat_interface(text, files, use_docscopeocr, history):
-    if text is None and files is None:
-        return "Error: Please input a text query or provide files."
-    return model_inference(text, files, history, use_docscopeocr)
-
 examples = [
-    {"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]},
-    {"text": "Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]},
-    {"text": "OCR the Image", "files": ["rolm/3.jpeg"]},
-    {"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]},
+    [{"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
+    [{"text": "Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]}],
+    [{"text": "OCR the Image", "files": ["rolm/3.jpeg"]}],
+    [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
 ]
 
-with gr.Blocks(theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **DocScope OCR `VL/OCR`**")
-    with gr.Row():
-        text_input = gr.Textbox(label="Query Input", placeholder="Input your query here.")
-        file_input = gr.File(label="Upload Files", file_count="multiple", file_types=["image", "video", "pdf"])
-        use_docscopeocr = gr.Checkbox(label="Use DocScopeOCR", value=True, info="Check to use DocScopeOCR, uncheck to use Qwen2VL OCR")
-    chat = gr.Chatbot()
-    submit_btn = gr.Button("Submit")
-    stop_btn = gr.Button("Stop Generation")
-
-    def submit(text, files, use_docscopeocr, history):
-        if not history:
-            history = []
-        history.append({"role": "user", "content": text})
-        return history, gr.update(interactive=False), gr.update(interactive=True)
-
-    def generate(history, text, files, use_docscopeocr):
-        if not history:
-            history = []
-        for response in model_inference(text, files, history, use_docscopeocr):
-            history.append({"role": "assistant", "content": response})
-            yield history
-
-    submit_btn.click(submit, [text_input, file_input, use_docscopeocr, chat], [chat, submit_btn, stop_btn])
-    submit_btn.click(generate, [chat, text_input, file_input, use_docscopeocr], chat)
+demo = gr.ChatInterface(
+    fn=model_inference,
+    description="# **DocScope OCR `VL/OCR`**",
+    examples=examples,
+    textbox=gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "video"],
+        file_count="multiple",
+        placeholder="Input your query and optionally upload image(s) or video(s). Select the model using the checkbox."
+    ),
+    stop_btn="Stop Generation",
+    multimodal=True,
+    cache_examples=False,
+    theme="bethecloud/storj_theme",
+    additional_inputs=[gr.Checkbox(label="Use DocScopeOCR", value=True, info="Check to use DocScopeOCR, uncheck to use Qwen2VL OCR")],
+)
 
 demo.launch(debug=True, ssr_mode=False)
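
Note on the migration: with multimodal=True, gr.ChatInterface delivers each user turn to fn as a dict of the form {"text": str, "files": [list of local file paths]}, and appends the values of any additional_inputs as extra positional arguments. That is why model_inference now takes (message, history, use_docscopeocr) and unpacks message["text"] / message.get("files", []) instead of receiving separate text and files parameters, and why each entry in examples becomes a one-element list wrapping such a dict. Below is a minimal runnable sketch of a handler that satisfies this contract; the name handler_sketch and its echo-style body are illustrative stand-ins, not the model code from this commit.

import gradio as gr

def handler_sketch(message, history, use_docscopeocr):
    # gr.ChatInterface(multimodal=True) passes the user turn as a dict:
    # {"text": "<typed query>", "files": ["<local file path>", ...]}
    text = message["text"].strip()
    files = message.get("files", [])
    # Streaming contract: each yield replaces the previous partial reply
    # in the chat window, so build one buffer and re-yield it.
    buffer = f"Model: {'DocScopeOCR' if use_docscopeocr else 'Qwen2VL OCR'}\n"
    yield buffer
    for path in files:
        buffer += f"received file: {path}\n"  # the real app loads images / samples video frames here
        yield buffer
    yield buffer + (text or "(no text query)")

demo = gr.ChatInterface(
    fn=handler_sketch,
    multimodal=True,
    textbox=gr.MultimodalTextbox(file_types=["image", "video"], file_count="multiple"),
    additional_inputs=[gr.Checkbox(label="Use DocScopeOCR", value=True)],
)

if __name__ == "__main__":
    demo.launch()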