from typing import Tuple, Optional

import gradio as gr
import spaces
import supervision as sv
import torch
from PIL import Image
from gradio_image_prompter import ImagePrompter

from utils.annotate import annotate_with_boxes
from utils.models import (
    load_models, run_inference, CHECKPOINTS,
    pre_process_region_task_input, post_process_region_output
)
from utils.tasks import (
    TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME,
    CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME,
    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME,
    IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES,
    TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES,
    IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME,
    DENSE_REGION_CAPTION_TASK_NAME
)

MARKDOWN = """
# Florence-2 🔥

Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the 
MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities 
across tasks such as captioning, object detection, grounding, and segmentation.
The model takes images and task prompts as input, generating the desired results in 
text format. It uses a DaViT vision encoder to convert images into visual token 
embeddings. These are then concatenated with BERT-generated text embeddings and 
processed by a transformer-based multi-modal encoder-decoder to generate the response.
"""

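# For reference, run_inference in utils.models presumably wraps the standard
# Florence-2 usage shown on the Hugging Face model card. The sketch below is the
# assumed flow, not the exact implementation used by this Space:
#
#   from transformers import AutoModelForCausalLM, AutoProcessor
#
#   model = AutoModelForCausalLM.from_pretrained(
#       "microsoft/Florence-2-large-ft", trust_remote_code=True).to("cuda").eval()
#   processor = AutoProcessor.from_pretrained(
#       "microsoft/Florence-2-large-ft", trust_remote_code=True)
#
#   # task tokens such as "<OD>", "<CAPTION>", "<OCR>" select the task
#   inputs = processor(text="<OD>", images=image, return_tensors="pt").to("cuda")
#   generated_ids = model.generate(
#       input_ids=inputs["input_ids"],
#       pixel_values=inputs["pixel_values"],
#       max_new_tokens=1024,
#       num_beams=3
#   )
#   generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
#   parsed = processor.post_process_generation(
#       generated_text, task="<OD>", image_size=image.size)
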
EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None],
]

# The @spaces.GPU decorator allocates a CUDA device for `process` at call time on
# ZeroGPU Spaces, so the device is hardcoded rather than auto-detected.
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = "cuda"
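# load_models is assumed to return two dicts keyed by checkpoint id (CHECKPOINTS),
# matching how the handler below looks models and processors up by the dropdown value.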
MODELS, PROCESSORS = load_models(DEVICE)


@spaces.GPU
def process(
    checkpoint_dropdown,
    task_dropdown,
    image_input,
    image_prompter_input
) -> Tuple[Optional[Image.Image], Optional[str]]:
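    """Run the selected task and return an (annotated image, text) pair;
    the output that does not apply to the task is returned as None."""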
    model = MODELS[checkpoint_dropdown]
    processor = PROCESSORS[checkpoint_dropdown]
    task = TASKS[task_dropdown]

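    # Detection-style tasks: run the task prompt and draw the parsed boxes on the image.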
    if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None

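    # Text-only tasks (captioning, plain OCR): the response dict is keyed by the task token.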
    elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        return None, response[task]

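    # Region-prompted tasks: run one inference per drawn box, then merge the detections.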
    elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
        detections_list = []

        image_input = image_prompter_input["image"]
        for prompt in image_prompter_input["points"]:
            text = pre_process_region_task_input(
                prompt=prompt,
                resolution_wh=image_input.size
            )
            _, response = run_inference(
                model, processor, DEVICE, image_input, task, text)
            detections = sv.Detections.from_lmm(
                lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
            detections_list.append(detections)
        detections = sv.Detections.merge(detections_list=detections_list)
        detections = post_process_region_output(
            detections=detections, resolution_wh=image_input.size)

        return annotate_with_boxes(image_input, detections), None
    # Fallback for any task that does not match one of the branches above.
    return None, None

with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        checkpoint_dropdown_component = gr.Dropdown(
            choices=CHECKPOINTS,
            value=CHECKPOINTS[0],
            label="Model", info="Select a Florence 2 model to use.",
            interactive=True
        )
        task_dropdown_component = gr.Dropdown(
            choices=TASK_NAMES,
            value=TASK_NAMES[0],
            label="Task", info="Select a task to perform with the model.",
            interactive=True
        )

    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(
                type='pil', label='Upload image')
            image_prompter_input_component = ImagePrompter(
                type='pil', label='Image prompt', visible=False)
            submit_button_component = gr.Button(value='Submit', variant='primary')

        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(label='Caption Output', visible=False)
    with gr.Row():
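        # run_on_click=True runs `process` on the example inputs when a row is
        # clicked, instead of only populating the input components.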
        gr.Examples(
            fn=process,
            examples=EXAMPLES,
            inputs=[
                checkpoint_dropdown_component,
                task_dropdown_component,
                image_input_component,
                image_prompter_input_component
            ],
            outputs=[
                image_output_component,
                text_output_component
            ],
            run_on_click=True
        )

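    # Show only the input/output components that the selected task actually uses.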
    def on_dropdown_change(task_name):
        return [
            gr.Image(visible=task_name in IMAGE_INPUT_TASK_NAMES),
            ImagePrompter(visible=task_name in IMAGE_PROMPTER_INPUT_TASK_NAMES),
            gr.Image(visible=task_name in IMAGE_OUTPUT_TASK_NAMES),
            gr.Textbox(visible=task_name in TEXTBOX_OUTPUT_TASK_NAMES)
        ]

    task_dropdown_component.change(
        on_dropdown_change,
        inputs=[task_dropdown_component],
        outputs=[
            image_input_component,
            image_prompter_input_component,
            image_output_component,
            text_output_component
        ]
    )
    submit_button_component.click(
        fn=process,
        inputs=[
            checkpoint_dropdown_component,
            task_dropdown_component,
            image_input_component,
            image_prompter_input_component
        ],
        outputs=[
            image_output_component,
            text_output_component
        ]
    )

demo.launch(debug=False, show_error=True, max_threads=1)