Spaces:

prithivMLmods
/

DocScope-R1

Running on Zero

App Files Community

prithivMLmods committed on May 4

Commit

b3a3e40

verified ·

1 Parent(s): 2df2030

Update app.py

Browse files

Files changed (1) hide show

app.py +176 -407

app.py CHANGED Viewed

@@ -1,413 +1,182 @@
 import gradio as gr
-import spaces
 import torch
-from diffusers import AutoencoderKL, TCDScheduler
-from diffusers.models.model_loading_utils import load_state_dict
-from huggingface_hub import hf_hub_download
-from controlnet_union import ControlNetModel_Union
-from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
-from PIL import Image, ImageDraw
 import numpy as np
-config_file = hf_hub_download(
-    "xinsir/controlnet-union-sdxl-1.0",
-    filename="config_promax.json",
-)
-config = ControlNetModel_Union.load_config(config_file)
-controlnet_model = ControlNetModel_Union.from_config(config)
-model_file = hf_hub_download(
-    "xinsir/controlnet-union-sdxl-1.0",
-    filename="diffusion_pytorch_model_promax.safetensors",
-)
-sstate_dict = load_state_dict(model_file)
-model, _, _, _, _ = ControlNetModel_Union._load_pretrained_model(
-    controlnet_model, sstate_dict, model_file, "xinsir/controlnet-union-sdxl-1.0"
 )
-model.to(device="cuda", dtype=torch.float16)
-vae = AutoencoderKL.from_pretrained(
-    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
-).to("cuda")
-# Initialize both pipelines and store them in a dictionary
-pipelines = {
-    "RealVisXL V5.0 Lightning": StableDiffusionXLFillPipeline.from_pretrained(
-        "SG161222/RealVisXL_V5.0_Lightning",
-        torch_dtype=torch.float16,
-        vae=vae,
-        controlnet=model,
-        variant="fp16",
-    ).to("cuda"),
-    "RealVisXL V4.0 Lightning": StableDiffusionXLFillPipeline.from_pretrained(
-        "SG161222/RealVisXL_V4.0_Lightning",
-        torch_dtype=torch.float16,
-        vae=vae,
-        controlnet=model,
-        variant="fp16",
-    ).to("cuda"),
-}
-for pipe in pipelines.values():
-    pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
-def prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom):
-    target_size = (width, height)
-    # Calculate the scaling factor to fit the image within the target size
-    scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
-    new_width = int(image.width * scale_factor)
-    new_height = int(image.height * scale_factor)
-    # Resize the source image to fit within target size
-    source = image.resize((new_width, new_height), Image.LANCZOS)
-    # Apply resize option using percentages
-    if resize_option == "Full":
-        resize_percentage = 100
-    elif resize_option == "50%":
-        resize_percentage = 50
-    elif resize_option == "33%":
-        resize_percentage = 33
-    elif resize_option == "25%":
-        resize_percentage = 25
-    else:  # Custom
-        resize_percentage = custom_resize_percentage
-    # Calculate new dimensions based on percentage
-    resize_factor = resize_percentage / 100
-    new_width = int(source.width * resize_factor)
-    new_height = int(source.height * resize_factor)
-    # Ensure minimum size of 64 pixels
-    new_width = max(new_width, 64)
-    new_height = max(new_height, 64)
-    # Resize the image
-    source = source.resize((new_width, new_height), Image.LANCZOS)
-    # Calculate the overlap in pixels based on the percentage
-    overlap_x = int(new_width * (overlap_percentage / 100))
-    overlap_y = int(new_height * (overlap_percentage / 100))
-    # Ensure minimum overlap of 1 pixel
-    overlap_x = max(overlap_x, 1)
-    overlap_y = max(overlap_y, 1)
-    # Calculate margins based on alignment
-    if alignment == "Middle":
-        margin_x = (target_size[0] - new_width) // 2
-        margin_y = (target_size[1] - new_height) // 2
-    elif alignment == "Left":
-        margin_x = 0
-        margin_y = (target_size[1] - new_height) // 2
-    elif alignment == "Right":
-        margin_x = target_size[0] - new_width
-        margin_y = (target_size[1] - new_height) // 2
-    elif alignment == "Top":
-        margin_x = (target_size[0] - new_width) // 2
-        margin_y = 0
-    elif alignment == "Bottom":
-        margin_x = (target_size[0] - new_width) // 2
-        margin_y = target_size[1] - new_height
-    # Adjust margins to eliminate gaps
-    margin_x = max(0, min(margin_x, target_size[0] - new_width))
-    margin_y = max(0, min(margin_y, target_size[1] - new_height))
-    # Create a new background image and paste the resized source image
-    background = Image.new('RGB', target_size, (255, 255, 255))
-    background.paste(source, (margin_x, margin_y))
-    # Create the mask
-    mask = Image.new('L', target_size, 255)
-    mask_draw = ImageDraw.Draw(mask)
-    # Calculate overlap areas
-    white_gaps_patch = 2
-    left_overlap = margin_x + overlap_x if overlap_left else margin_x + white_gaps_patch
-    right_overlap = margin_x + new_width - overlap_x if overlap_right else margin_x + new_width - white_gaps_patch
-    top_overlap = margin_y + overlap_y if overlap_top else margin_y + white_gaps_patch
-    bottom_overlap = margin_y + new_height - overlap_y if overlap_bottom else margin_y + new_height - white_gaps_patch
-    if alignment == "Left":
-        left_overlap = margin_x + overlap_x if overlap_left else margin_x
-    elif alignment == "Right":
-        right_overlap = margin_x + new_width - overlap_x if overlap_right else margin_x + new_width
-    elif alignment == "Top":
-        top_overlap = margin_y + overlap_y if overlap_top else margin_y
-    elif alignment == "Bottom":
-        bottom_overlap = margin_y + new_height - overlap_y if overlap_bottom else margin_y + new_height
-    # Draw the mask
-    mask_draw.rectangle([
-        (left_overlap, top_overlap),
-        (right_overlap, bottom_overlap)
-    ], fill=0)
-    return background, mask
-@spaces.GPU(duration=28)
-def infer(image, width, height, overlap_percentage, num_inference_steps, resize_option, custom_resize_percentage, prompt_input, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom, selected_model):
-    background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom)
-    cnet_image = background.copy()
-    cnet_image.paste(0, (0, 0), mask)
-    final_prompt = f"{prompt_input} , high quality, 4k"
-    # Access the selected pipeline from the dictionary
-    pipe = pipelines[selected_model]
-    (
-        prompt_embeds,
-        negative_prompt_embeds,
-        pooled_prompt_embeds,
-        negative_pooled_prompt_embeds,
-    ) = pipe.encode_prompt(final_prompt, "cuda", True)
-    # Generate the image
-    for image in pipe(
-        prompt_embeds=prompt_embeds,
-        negative_prompt_embeds=negative_prompt_embeds,
-        pooled_prompt_embeds=pooled_prompt_embeds,
-        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
-        image=cnet_image,
-        num_inference_steps=num_inference_steps
-    ):
-        pass  # Wait for the generation to complete
-    generated_image = image  # Get the last image
-    generated_image = generated_image.convert("RGBA")
-    cnet_image.paste(generated_image, (0, 0), mask)
-    return cnet_image
-def clear_result():
-    """Clears the result Image."""
-    return gr.update(value=None)
-def preload_presets(target_ratio, ui_width, ui_height):
-    """Updates the width and height sliders based on the selected aspect ratio."""
-    if target_ratio == "9:16":
-        changed_width = 720
-        changed_height = 1280
-        return changed_width, changed_height, gr.update()
-    elif target_ratio == "16:9":
-        changed_width = 1280
-        changed_height = 720
-        return changed_width, changed_height, gr.update()
-    elif target_ratio == "1:1":
-        changed_width = 1024
-        changed_height = 1024
-        return changed_width, changed_height, gr.update()
-    elif target_ratio == "Custom":
-        return ui_width, ui_height, gr.update(open=True)
-def select_the_right_preset(user_width, user_height):
-    if user_width == 720 and user_height == 1280:
-        return "9:16"
-    elif user_width == 1280 and user_height == 720:
-        return "16:9"
-    elif user_width == 1024 and user_height == 1024:
-        return "1:1"
     else:
-        return "Custom"
-def toggle_custom_resize_slider(resize_option):
-    return gr.update(visible=(resize_option == "Custom"))
-def update_history(new_image, history):
-    """Updates the history gallery with the new image."""
-    if history is None:
-        history = []
-    history.insert(0, new_image)
-    return history
-# CSS and title (unchanged)
-css = """
-h1 {
-  text-align: center;
-  display: block;
-}
-"""
-title = """<h1>Image Outpaint Expand 🪃</h1>"""
-with gr.Blocks(css=css) as demo:
-    with gr.Column():
-        gr.HTML(title)
-        with gr.Row():
-            with gr.Column():
-                input_image = gr.Image(
-                    type="pil",
-                    label="Input Image"
-                )
-                with gr.Row():
-                    with gr.Column(scale=2):
-                        prompt_input = gr.Textbox(label="Prompt (Optional)")
-                    with gr.Column(scale=1):
-                        run_button = gr.Button("Re-Generate Image / Diffusers Outpaint Image Lightning / Lightning v4, v5", elem_classes="submit-btn")
-                with gr.Row():
-                    model_selector = gr.Dropdown(
-                        label="Select Model",
-                        choices=list(pipelines.keys()),
-                        value="RealVisXL V5.0 Lightning",
-                    )
-                with gr.Row():
-                    target_ratio = gr.Radio(
-                        label="Expected Ratio",
-                        choices=["9:16", "16:9", "1:1", "Custom"],
-                        value="9:16",
-                        scale=2
-                    )
-                    alignment_dropdown = gr.Dropdown(
-                        choices=["Middle", "Left", "Right", "Top", "Bottom"],
-                        value="Middle",
-                        label="Alignment"
-                    )
-                with gr.Accordion(label="Advanced settings", open=False) as settings_panel:
-                    with gr.Column():
-                        with gr.Row():
-                            width_slider = gr.Slider(
-                                label="Width",
-                                minimum=720,
-                                maximum=1536,
-                                step=8,
-                                value=720,
-                            )
-                            height_slider = gr.Slider(
-                                label="Height",
-                                minimum=720,
-                                maximum=1536,
-                                step=8,
-                                value=1280,
-                            )
-                        num_inference_steps = gr.Slider(label="Steps", minimum=4, maximum=12, step=1, value=8)
-                        with gr.Group():
-                            overlap_percentage = gr.Slider(
-                                label="Mask overlap (%)",
-                                minimum=1,
-                                maximum=50,
-                                value=10,
-                                step=1
-                            )
-                            with gr.Row():
-                                overlap_top = gr.Checkbox(label="Overlap Top", value=True)
-                                overlap_right = gr.Checkbox(label="Overlap Right", value=True)
-                            with gr.Row():
-                                overlap_left = gr.Checkbox(label="Overlap Left", value=True)
-                                overlap_bottom = gr.Checkbox(label="Overlap Bottom", value=True)
-                        with gr.Row():
-                            resize_option = gr.Radio(
-                                label="Resize input image",
-                                #choices=["Full", "50%", "33%", "25%", "Custom"],
-                                choices=["Full", "50%", "33%", "25%", "Custom"],
-                                value="Full"
-                            )
-                            custom_resize_percentage = gr.Slider(
-                                label="Custom resize (%)",
-                                minimum=1,
-                                maximum=100,
-                                step=1,
-                                value=50,
-                                visible=False
-                            )
-                gr.Examples(
-                    examples=[
-                        ["./examples/3.jpg", 1024, 1024, "Top"],
-                        ["./examples/4.jpg", 1024, 1024, "Middle"],
-                        ["./examples/2.png", 720, 1280, "Left"],
-                        ["./examples/1.png", 1280, 720, "Bottom"],
-                        ["./examples/5.jpg", 1024, 1024, "Bottom"],
-                    ],
-                    inputs=[input_image, width_slider, height_slider, alignment_dropdown],
-                )
-            with gr.Column():
-                result = gr.Image(
-                    interactive=False,
-                    label="Generated Image",
-                    format="png",
-                )
-                history_gallery = gr.Gallery(label="History", columns=6, object_fit="contain", interactive=False)
-    target_ratio.change(
-        fn=preload_presets,
-        inputs=[target_ratio, width_slider, height_slider],
-        outputs=[width_slider, height_slider, settings_panel],
-        queue=False
-    )
-    width_slider.change(
-        fn=select_the_right_preset,
-        inputs=[width_slider, height_slider],
-        outputs=[target_ratio],
-        queue=False
-    )
-    height_slider.change(
-        fn=select_the_right_preset,
-        inputs=[width_slider, height_slider],
-        outputs=[target_ratio],
-        queue=False
-    )
-    resize_option.change(
-        fn=toggle_custom_resize_slider,
-        inputs=[resize_option],
-        outputs=[custom_resize_percentage],
-        queue=False
-    )
-    run_button.click(
-        fn=clear_result,
-        inputs=None,
-        outputs=result,
-    ).then(
-        fn=infer,
-        inputs=[input_image, width_slider, height_slider, overlap_percentage, num_inference_steps,
-                resize_option, custom_resize_percentage, prompt_input, alignment_dropdown,
-                overlap_left, overlap_right, overlap_top, overlap_bottom, model_selector],
-        outputs=result,
-    ).then(
-        fn=lambda x, history: update_history(x, history),
-        inputs=[result, history_gallery],
-        outputs=history_gallery,
-    )
-    prompt_input.submit(
-        fn=clear_result,
-        inputs=None,
-        outputs=result,
-    ).then(
-        fn=infer,
-        inputs=[input_image, width_slider, height_slider, overlap_percentage, num_inference_steps,
-                resize_option, custom_resize_percentage, prompt_input, alignment_dropdown,
-                overlap_left, overlap_right, overlap_top, overlap_bottom, model_selector],
-        outputs=result,
-    ).then(
-        fn=lambda x, history: update_history(x, history),
-        inputs=[result, history_gallery],
-        outputs=history_gallery,
-    )
-demo.queue(max_size=20).launch(share=False, ssr_mode=False, show_error=True)

 import gradio as gr
+from transformers.image_utils import load_image
+from threading import Thread
+import time
 import torch
+import spaces
+import cv2
 import numpy as np
+from PIL import Image
+from transformers import (
+    Qwen2VLForConditionalGeneration,
+    AutoProcessor,
+    TextIteratorStreamer,
 )
+from transformers import Qwen2_5_VLForConditionalGeneration
+from pdf2image import convert_from_path
+# Helper Functions
+def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
+    """
+    Returns an HTML snippet for a thin animated progress bar with a label.
+    Colors can be customized; default colors are used for Qwen2VL/Aya‑Vision.
+    """
+    return f'''
+<div style="display: flex; align-items: center;">
+    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+    <div style="width: 110px; height: 5px; background-color: {secondary_color}; border-radius: 2px; overflow: hidden;">
+        <div style="width: 100%; height: 100%; background-color: {primary_color}; animation: loading 1.5s linear infinite;"></div>
+    </div>
+</div>
+<style>
+@keyframes loading {{
+    0% {{ transform: translateX(-100%); }}
+    100% {{ transform: translateX(100%); }}
+}}
+</style>
+    '''
+def downsample_video(video_path):
+    """
+    Downsamples a video file by extracting 10 evenly spaced frames.
+    Returns a list of tuples (PIL.Image, timestamp).
+    """
+    vidcap = cv2.VideoCapture(video_path)
+    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
+    frames = []
+    if total_frames <= 0 or fps <= 0:
+        vidcap.release()
+        return frames
+    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+    for i in frame_indices:
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        success, image = vidcap.read()
+        if success:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(image)
+            timestamp = round(i / fps, 2)
+            frames.append((pil_image, timestamp))
+    vidcap.release()
+    return frames
+# Model and Processor Setup
+QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
+qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
+    QV_MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to("cuda").eval()
+ROLMOCR_MODEL_ID = "reducto/RolmOCR"
+rolmocr_processor = AutoProcessor.from_pretrained(ROLMOCR_MODEL_ID, trust_remote_code=True)
+rolmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    ROLMOCR_MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16
+).to("cuda").eval()
+# Main Inference Function
+@spaces.GPU
+def model_inference(message, history, use_rolmocr):
+    text = message["text"].strip()
+    files = message.get("files", [])
+    if not text and not files:
+        yield "Error: Please input a text query or provide files (images, videos, PDFs)."
+        return
+    # Process files: images, videos, PDFs
+    image_list = []
+    for idx, file in enumerate(files):
+        if file.lower().endswith(".pdf"):
+            try:
+                pdf_images = convert_from_path(file)
+                for page_num, img in enumerate(pdf_images, start=1):
+                    label = f"PDF {idx+1} Page {page_num}:"
+                    image_list.append((label, img))
+            except Exception as e:
+                yield f"Error converting PDF: {str(e)}"
+                return
+        elif file.lower().endswith((".mp4", ".avi", ".mov")):
+            frames = downsample_video(file)
+            if not frames:
+                yield "Error: Could not extract frames from the video."
+                return
+            for frame, timestamp in frames:
+                label = f"Video {idx+1} Frame {timestamp}:"
+                image_list.append((label, frame))
+        else:
+            try:
+                img = load_image(file)
+                label = f"Image {idx+1}:"
+                image_list.append((label, img))
+            except Exception as e:
+                yield f"Error loading image: {str(e)}"
+                return
+    # Build content list
+    content = [{"type": "text", "text": text}]
+    for label, img in image_list:
+        content.append({"type": "text", "text": label})
+        content.append({"type": "image", "image": img})
+    messages = [{"role": "user", "content": content}]
+    # Select processor and model
+    if use_rolmocr:
+        processor = rolmocr_processor
+        model = rolmocr_model
+        model_name = "RolmOCR"
     else:
+        processor = qwen_processor
+        model = qwen_model
+        model_name = "Qwen2VL OCR"
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    all_images = [item["image"] for item in content if item["type"] == "image"]
+    inputs = processor(
+        text=[prompt_full],
+        images=all_images if all_images else None,
+        return_tensors="pt",
+        padding=True,
+    ).to("cuda")
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    yield progress_bar_html(f"Processing with {model_name}")
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer
+# Gradio Interface
+examples = [
+    [{"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
+    [{"text": "Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]}],
+    [{"text": "OCR the Image", "files": ["rolm/3.jpeg"]}],
+    [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
+]
+demo = gr.ChatInterface(
+    fn=model_inference,
+    description="# **Multimodal OCR with Model Selection**",
+    examples=examples,
+    textbox=gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "video", "pdf"],
+        file_count="multiple",
+        placeholder="Input your query and optionally upload image(s), video(s), or PDF(s). Select the model using the checkbox."
+    ),
+    stop_btn="Stop Generation",
+    multimodal=True,
+    cache_examples=False,
+    additional_inputs=[gr.Checkbox(label="Use RolmOCR", value=True)],
+)
+demo.launch(debug=True)