A modular Florence-2 block that can also be used with Mellon.

## How to use

### With Mellon

The node works with a default installation of Mellon through the Dynamic Block Node.

### Using it with code

#### Captioning

```python
import torch

from diffusers.modular_pipelines import ModularPipeline
from diffusers.utils import load_image


pipe = ModularPipeline.from_pretrained("OzzyGT/florence-2-block", trust_remote_code=True)
pipe.load_components(torch_dtype=torch.float16)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/OzzyGT/diffusers-examples/resolve/main/florence-2/white_board_people.png"
)

annotation_task = "<CAPTION>"  # can also be <DETAILED_CAPTION> or <MORE_DETAILED_CAPTION>
annotation_prompt = ""

output = pipe(image=image, annotation_task=annotation_task, annotation_prompt=annotation_prompt).annotations[0]
print(output)
```

**Caption**

A man and a woman writing on a white board.

**Detailed Caption**

In this image we can see a man and a woman holding markers in their hands. We can also see a board with some text on it.

**More Detailed Caption**

A man and a woman are standing in front of a whiteboard. The woman is writing on a black marker. The man is wearing a blue shirt. The whiteboard has writing on it. The writing on the whiteboard is black. The people are looking at each other. There is writing in black marker on the board. There are drawings on whiteboard behind the people.
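With the pipeline already loaded, comparing the three caption granularities only requires swapping the task token; a minimal sketch reusing `pipe` and `image` from the example above:

```python
# Compare the three caption granularities on the same image.
for task in ("<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>"):
    caption = pipe(image=image, annotation_task=task, annotation_prompt="").annotations[0]
    print(f"{task}: {caption}")
```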

#### Object Detection

```python
import torch

from diffusers.modular_pipelines import ModularPipeline
from diffusers.utils import load_image


pipe = ModularPipeline.from_pretrained("OzzyGT/florence-2-block", trust_remote_code=True)
pipe.load_components(torch_dtype=torch.float16)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/OzzyGT/diffusers-examples/resolve/main/florence-2/white_board_people.png"
)

annotation_task = "<OD>"
annotation_prompt = ""

output = pipe(
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="bounding_box",
).images[0]
output.save("output.png")
```

*(Image comparison: input and the output with detected bounding boxes.)*
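If you need the detections as data rather than a rendered image, note that the captioning and OCR examples read results through `annotations`; a minimal sketch assuming `<OD>` populates it the same way:

```python
# Assumption: like the captioning and OCR examples, the pipeline output
# also exposes the raw Florence-2 annotations for the `<OD>` task.
result = pipe(
    image=image,
    annotation_task="<OD>",
    annotation_prompt="",
    annotation_output_type="bounding_box",
)
print(result.annotations[0])  # raw detections (labels and boxes)
output_image = result.images[0]  # same rendered image as above
```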

#### Dense Region Caption

```python
import torch

from diffusers.modular_pipelines import ModularPipeline
from diffusers.utils import load_image


pipe = ModularPipeline.from_pretrained("OzzyGT/florence-2-block", trust_remote_code=True)
pipe.load_components(torch_dtype=torch.float16)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/OzzyGT/diffusers-examples/resolve/main/florence-2/white_board_people.png"
)

annotation_task = "<DENSE_REGION_CAPTION>"
annotation_prompt = ""

output = pipe(
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="bounding_box",
).images[0]
output.save("output.png")
```

*(Image comparison: input and the output with dense region captions.)*

#### Region Proposal

```python
import torch

from diffusers.modular_pipelines import ModularPipeline
from diffusers.utils import load_image


pipe = ModularPipeline.from_pretrained("OzzyGT/florence-2-block", trust_remote_code=True)
pipe.load_components(torch_dtype=torch.float16)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/OzzyGT/diffusers-examples/resolve/main/florence-2/white_board_people.png"
)

annotation_task = "<REGION_PROPOSAL>"
annotation_prompt = ""

output = pipe(
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="bounding_box",
).images[0]
output.save("output.png")
```

*(Image comparison: input and the output with proposed regions.)*

#### Phrase Grounding

```python
import torch

from diffusers.modular_pipelines import ModularPipeline
from diffusers.utils import load_image


pipe = ModularPipeline.from_pretrained("OzzyGT/florence-2-block", trust_remote_code=True)
pipe.load_components(torch_dtype=torch.float16)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/OzzyGT/diffusers-examples/resolve/main/florence-2/white_board_people.png"
)

annotation_task = "<CAPTION_TO_PHRASE_GROUNDING>"
annotation_prompt = "man"

output = pipe(
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="bounding_box",  # can also use `mask_image` and `mask_overlay`
).images[0]
output.save("output.png")
```

*(Image grid: input, bounding box, mask image, and mask overlay outputs.)*
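The comment in the example above notes that `mask_image` and `mask_overlay` are also valid output types for this task; a short sketch that renders all three variants for the same prompt:

```python
# Render the same grounding result with each supported output type.
for output_type in ("bounding_box", "mask_image", "mask_overlay"):
    result = pipe(
        image=image,
        annotation_task="<CAPTION_TO_PHRASE_GROUNDING>",
        annotation_prompt="man",
        annotation_output_type=output_type,
    ).images[0]
    result.save(f"output_{output_type}.png")
```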

#### Referring Expression Segmentation

```python
import torch

from diffusers.modular_pipelines import ModularPipeline
from diffusers.utils import load_image


pipe = ModularPipeline.from_pretrained("OzzyGT/florence-2-block", trust_remote_code=True)
pipe.load_components(torch_dtype=torch.float16)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/OzzyGT/diffusers-examples/resolve/main/florence-2/white_board_people.png"
)

annotation_task = "<REFERRING_EXPRESSION_SEGMENTATION>"
annotation_prompt = "man"

output = pipe(
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="mask_image",  # can also use `mask_overlay`
).images[0]
output.save("output.png")
```

*(Image grid: input, mask image, and mask overlay outputs.)*
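A typical reason to request `mask_image` is to pass the mask to a downstream editing step. As a quick sanity check, here is a minimal sketch that measures how much of the image the mask covers, assuming the mask is returned as a PIL image (which the `output.save` call above implies):

```python
import numpy as np

# Binarize the returned mask and measure its coverage of the frame.
mask = pipe(
    image=image,
    annotation_task="<REFERRING_EXPRESSION_SEGMENTATION>",
    annotation_prompt="man",
    annotation_output_type="mask_image",
).images[0]

mask_array = np.array(mask.convert("L")) > 127
print(f"Mask covers {mask_array.mean() * 100:.1f}% of the image")
```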

#### Open Vocabulary Detection

```python
import torch

from diffusers.modular_pipelines import ModularPipeline
from diffusers.utils import load_image


pipe = ModularPipeline.from_pretrained("OzzyGT/florence-2-block", trust_remote_code=True)
pipe.load_components(torch_dtype=torch.float16)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/OzzyGT/diffusers-examples/resolve/main/florence-2/white_board_people.png"
)

annotation_task = "<OPEN_VOCABULARY_DETECTION>"
annotation_prompt = "man with a beard"

output = pipe(
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="bounding_box",
).images[0]
output.save("output.png")
```

*(Image comparison: input and the output with the detected region.)*

#### OCR

```python
import torch

from diffusers.modular_pipelines import ModularPipeline
from diffusers.utils import load_image


pipe = ModularPipeline.from_pretrained("OzzyGT/florence-2-block", trust_remote_code=True)
pipe.load_components(torch_dtype=torch.float16)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/OzzyGT/diffusers-examples/resolve/main/florence-2/white_board_people.png"
)

annotation_task = "<OCR>"
annotation_prompt = ""

output = pipe(
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="bounding_box",
).annotations[0]
print(output)
```

The raw OCR output is returned as a single string; line breaks from the whiteboard are not preserved and the recognition contains a few errors:

```text
The Diffuser's library byHugging Face makes it easyfor developers to run imagegeneration and influenceusing state-of-the-astdiffusion models withjust a few lines of codehuman eou
```

#### OCR with Region

```python
import torch

from diffusers.modular_pipelines import ModularPipeline
from diffusers.utils import load_image


pipe = ModularPipeline.from_pretrained("OzzyGT/florence-2-block", trust_remote_code=True)
pipe.load_components(torch_dtype=torch.float16)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/OzzyGT/diffusers-examples/resolve/main/florence-2/white_board_people.png"
)

annotation_task = "<OCR_WITH_REGION>"
annotation_prompt = ""

output = pipe(
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="bounding_box",
).images[0]
output.save("output.png")
```

*(Image comparison: input and the output with OCR bounding boxes.)*