import math
import os
from collections import OrderedDict
from io import BytesIO
from pathlib import Path

import gradio as gr
import pandas as pd
import pdf2image
import PyPDF2
from PIL import Image, ImageDraw, ImageFont
# Intended upper bound on the number of pages processed per PDF.
MAX_PAGES = 50
# NOTE(review): presumably a byte limit (~100 MB) for input PDFs; it is not
# referenced anywhere in the visible code -- confirm or remove.
MAX_PDF_SIZE = 100_000_000
# Embedded images that are narrower than MIN_WIDTH *and* shorter than
# MIN_HEIGHT (in pixels) are skipped by pdf_to_grid.
MIN_WIDTH, MIN_HEIGHT = 150, 150
def equal_image_grid(images):
    """Tile *images* into a single near-square contact sheet.

    Every image is rescaled (aspect ratio preserved) to the width of the
    narrowest one, then pasted row by row into a new RGB canvas whose cells
    are as tall as the tallest rescaled image.

    Args:
        images: iterable of PIL images. Images with a zero width or height
            are ignored.

    Returns:
        The assembled ``PIL.Image.Image`` grid, or ``None`` when there is no
        usable image to draw.
    """

    def compute_grid(n, max_cols=6):
        """Return ``(rows, cols)`` with ``rows * cols >= n`` and ``cols <= max_cols``."""
        # Aim for a square layout, but never exceed max_cols columns; the
        # remaining images spill over into additional rows.
        cols = max(1, min(math.ceil(math.sqrt(n)), max_cols))
        rows = math.ceil(n / cols)
        return rows, cols

    # Filter first so the grid is sized for the images that are actually drawn.
    images = [im for im in images if im.height > 0 and im.width > 0]
    if not images:
        return None

    rows, cols = compute_grid(len(images))

    cell_w = min(im.width for im in images)
    images = [
        # max(1, ...) keeps extremely wide images from collapsing to a
        # zero-pixel height, which resize() cannot handle.
        im.resize(
            (cell_w, max(1, int(im.height * cell_w / im.width))),
            resample=Image.BICUBIC,
        )
        for im in images
    ]
    cell_h = max(im.height for im in images)

    grid = Image.new("RGB", size=(cols * cell_w, rows * cell_h))
    for i, img in enumerate(images):
        grid.paste(img, box=(i % cols * cell_w, i // cols * cell_h))
    return grid
def add_pagenumbers(im_list, height_scale=40):
    """Stamp a 1-based running number near the bottom-right of each image.

    The images are modified in place. The number is the position of the image
    in *im_list*, so it only equals the PDF page number when the list holds
    exactly one image per page.

    Args:
        im_list: list of PIL images to annotate.
        height_scale: divisor applied to the geometric mean of width and
            height to derive the font size; larger values give smaller text.

    Returns:
        None.
    """
    fonts = {}  # font size -> loaded font, so each size is loaded only once

    def get_font(size):
        if size not in fonts:
            try:
                fonts[size] = ImageFont.truetype("Arial.ttf", size)
            except OSError:
                # Arial is not installed on every host; fall back to Pillow's
                # built-in font instead of failing the whole request.
                fonts[size] = ImageFont.load_default()
        return fonts[size]

    for index, image in enumerate(im_list):
        width, height = image.size
        # Clamp to 1: tiny images would otherwise yield an invalid size of 0.
        fontsize = max(1, int((width * height) ** 0.5 / height_scale))
        margin = 2 * fontsize
        ImageDraw.Draw(image).text(
            (width - margin, height - margin),
            str(index + 1),
            fill="#D00917",
            font=get_font(fontsize),
            spacing=4,
            align="right",
        )
def pdf_to_grid(pdf_path):
    """Build a numbered image grid for the PDF at *pdf_path*.

    Embedded images are extracted with PyPDF2 from at most ``MAX_PAGES``
    pages; images smaller than ``MIN_WIDTH`` x ``MIN_HEIGHT`` in both
    dimensions are skipped. If PyPDF2 fails at any point, the pages are
    rasterised with pdf2image instead (also limited to ``MAX_PAGES``).

    Args:
        pdf_path: path to the PDF file.

    Returns:
        The grid image produced by ``equal_image_grid``, or ``None`` when no
        image could be obtained.
    """
    images = []
    try:
        # Opening the reader inside the try lets a PDF that PyPDF2 cannot
        # parse still reach the pdf2image fallback.
        reader = PyPDF2.PdfReader(pdf_path)
        for page_index, page in enumerate(reader.pages):
            if page_index >= MAX_PAGES:
                break
            for image in page.images:
                im = Image.open(BytesIO(image.data))
                if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
                    continue
                images.append(im)
    except Exception as e:
        # Deliberately broad: any extraction failure triggers the
        # rasterisation fallback rather than aborting the request.
        print(f"{pdf_path} PyPDF get_images {e}")
        images = pdf2image.convert_from_path(pdf_path, last_page=MAX_PAGES)

    if not images:
        return None
    add_pagenumbers(images)
    return equal_image_grid(images)
def main(complexity, evidence, form=None, operation=None, type=None):
    """Pick a random sample matching the selected filters and render its PDF.

    The positional arguments are paired, in order, with the keys of
    ``meta_cats``; each truthy value contributes a ``"<key>_<value> == True"``
    clause to a pandas query. Because ``zip`` stops at the shorter input,
    arguments beyond the number of ``meta_cats`` keys are ignored, which is
    why the trailing ones are optional. ``type`` shadows the builtin but is
    kept because the parameter names are part of the interface.

    NOTE(review): ``DIAGNOSTIC_TEST`` (used like a pandas DataFrame with
    boolean ``<key>_<value>`` columns plus ``nhash`` and ``label``) and
    ``PDF_PATH`` are not defined in the visible part of this file -- confirm
    where they come from.

    Returns:
        A 3-tuple ``(label, grid, pdf_path)`` matching the three output
        components, or ``(message, None, None)`` when nothing can be shown.
    """
    selections = [complexity, evidence, form, operation, type]
    query = " and ".join(
        f"{cat}_{val} == {True}"
        for cat, val in zip(meta_cats, selections)
        if val
    )
    # DataFrame.query() rejects an empty expression; no filter means all rows.
    results = DIAGNOSTIC_TEST.query(query) if query else DIAGNOSTIC_TEST
    if results.empty:
        return f"No results found for query {query}", None, None

    # Walk the matches in random order and return the first one that renders.
    for _, sample in results.sample(frac=1).iterrows():
        nhash = sample["nhash"]
        if not nhash:
            continue
        print("Sampled: ", nhash)

        pdf_path = PDF_PATH / "test" / (nhash + ".pdf")
        if not os.path.exists(pdf_path):
            continue
        grid = pdf_to_grid(pdf_path)
        if grid is None:
            continue
        return sample["label"], grid, pdf_path

    return f"No renderable PDF found for query {query}", None, None
# Document class names offered as choices of the "label" dropdown.
# NOTE(review): presumably the RVL-CDIP label set in dataset order -- confirm
# before relying on list positions.
_CLASSES = [
    "letter",
    "form",
    "email",
    "handwritten",
    "advertisement",
    "scientific report",
    "scientific publication",
    "specification",
    "file folder",
    "news article",
    "budget",
    "invoice",
    "presentation",
    "questionnaire",
    "resume",
    "memo",
]
# Test splits keyed by the names offered in the "dataset" dropdown.
# NOTE(review): `load_dataset` is not imported in the visible part of this
# file; it presumably comes from the Hugging Face `datasets` package --
# confirm and add the import.
# NOTE(review): both keys load the identical dataset id and split; confirm
# whether 'rvl_cdip_N' is meant to point at a different dataset.
# NOTE(review): DATASETS is not referenced by any function in the visible code.
DATASETS = {
    "rvl_cdip": load_dataset("bdpc/rvl_cdip_mp", split="test"),
    "rvl_cdip_N": load_dataset("bdpc/rvl_cdip_mp", split="test"),
}
# Filter categories: key -> selectable values. The key order defines how the
# positional arguments of `main` are matched to categories.
meta_cats = {
    "dataset": ["rvl_cdip", "rvl_cdip_N"],
    "label": _CLASSES,
}

# One dropdown per category (the name `sliders` is historical), each
# pre-selecting the last choice of its category.
sliders = [
    gr.Dropdown(choices=choices, value=choices[-1], label=label)
    for label, choices in meta_cats.items()
]
slider_defaults = [slider.value for slider in sliders]

# Output components, in the order `main` returns its values.
outputs = [
    gr.Textbox(label="label"),
    gr.Image(label="image grid of PDF"),
    gr.File(label="PDF"),
]

# NOTE(review): only two input components are supplied here; verify that
# `main` accepts being called with two positional arguments.
iface = gr.Interface(
    fn=main,
    inputs=sliders,
    outputs=outputs,
    description="Visualize PDF samples from multi-page (PDF) document classification datasets",
    title="Beyond Document Page Classification: Examples",
)

# Runs at import time and requests a public share link.
iface.launch(share=True)