fsommers committed
Commit 2493114
1 Parent(s): fcceed0

Initial version

Files changed (3)
  1. app.py +132 -0
  2. packages.txt +2 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,132 @@
+ import io
+ from typing import List
+
+ import numpy as np
+ import pandas as pd
+ import plotly.express as px
+ import pytesseract
+ import streamlit as st
+ import torch
+ import torch.nn.functional as F
+ from PIL import Image
+ from transformers import LayoutLMv3FeatureExtractor, LayoutLMv3TokenizerFast, LayoutLMv3Processor, LayoutLMv3ForSequenceClassification
+
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ TOKENIZER = "microsoft/layoutlmv3-base"
+ MODEL_NAME = "fsommers/layoutlmv3-autofinance-classification-us-v01"
+
+ TESS_OPTIONS = "--psm 3"  # Automatic page segmentation for Tesseract
+
+ @st.cache_resource
+ def create_ocr_reader():
+     def scale_bounding_box(box: List[int], w_scale: float = 1.0, h_scale: float = 1.0):
+         # Scale an [x0, y0, x1, y1] box into LayoutLMv3's 0-1000 coordinate space.
+         return [
+             int(box[0] * w_scale),
+             int(box[1] * h_scale),
+             int(box[2] * w_scale),
+             int(box[3] * h_scale)
+         ]
+
+     def ocr_page(image) -> dict:
+         """
+         OCR a given image. Return a dictionary with the recognized words
+         and a bounding box for each word.
+         """
+         ocr_df = pytesseract.image_to_data(image, output_type='data.frame', config=TESS_OPTIONS)
+         ocr_df = ocr_df.dropna().reset_index(drop=True)
+         float_cols = ocr_df.select_dtypes('float').columns
+         ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
+         # Drop rows whose recognized text is empty or whitespace-only.
+         ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
+         ocr_df = ocr_df.dropna().reset_index(drop=True)
+
+         words = [str(w) for w in ocr_df.text]
+
+         # Tesseract reports (left, top, width, height); convert to corner coordinates.
+         coordinates = ocr_df[['left', 'top', 'width', 'height']]
+         boxes = []
+         for _, row in coordinates.iterrows():
+             x, y, w, h = tuple(row)
+             boxes.append([x, y, x + w, y + h])
+
+         assert len(words) == len(boxes)
+         return {"bbox": boxes, "words": words}
+
+     def prepare_image(image):
+         ocr_data = ocr_page(image)
+         # LayoutLMv3 expects box coordinates normalized to the 0-1000 range.
+         width, height = image.size
+         width_scale = 1000 / width
+         height_scale = 1000 / height
+         words = []
+         boxes = []
+         for w, b in zip(ocr_data["words"], ocr_data["bbox"]):
+             words.append(w)
+             boxes.append(scale_bounding_box(b, width_scale, height_scale))
+
+         assert len(words) == len(boxes)
+         for box in boxes:
+             for coord in box:
+                 if coord > 1000:
+                     raise ValueError(f"Bounding box coordinate {coord} exceeds 1000")
+         return words, boxes
+
+     return prepare_image
+
+ @st.cache_resource
+ def create_model():
+     model = LayoutLMv3ForSequenceClassification.from_pretrained(MODEL_NAME)
+     return model.eval().to(DEVICE)
+
+ @st.cache_resource
+ def create_processor():
+     # OCR is handled by Tesseract above, so disable the processor's built-in OCR.
+     feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
+     tokenizer = LayoutLMv3TokenizerFast.from_pretrained(TOKENIZER)
+     return LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+
+ def predict(image, reader, processor: LayoutLMv3Processor, model: LayoutLMv3ForSequenceClassification):
+     words, boxes = reader(image)
+     encoding = processor(
+         image,
+         words,
+         boxes=boxes,
+         max_length=512,
+         padding="max_length",
+         truncation=True,
+         return_tensors="pt"
+     )
+     with torch.inference_mode():
+         output = model(
+             input_ids=encoding["input_ids"].to(DEVICE),
+             attention_mask=encoding["attention_mask"].to(DEVICE),
+             bbox=encoding["bbox"].to(DEVICE),
+             pixel_values=encoding["pixel_values"].to(DEVICE)
+         )
+     logits = output.logits
+     predicted_class = logits.argmax(dim=-1).item()
+     probabilities = F.softmax(logits, dim=-1).flatten().tolist()
+     return predicted_class, probabilities
+
+ reader = create_ocr_reader()
+ processor = create_processor()
+ model = create_model()
+
+ uploaded_file = st.file_uploader("Choose a JPG or PNG file", ["jpg", "png"])
+ if uploaded_file is not None:
+     bytes_data = io.BytesIO(uploaded_file.read())
+     # Convert to RGB so PNGs with an alpha channel work with the feature extractor.
+     image = Image.open(bytes_data).convert("RGB")
+     st.image(image, caption="Uploaded Image", use_column_width=True)
+     predicted, probabilities = predict(image, reader, processor, model)
+     predicted_label = model.config.id2label[predicted]
+     st.markdown(f"Predicted Label: {predicted_label}")
+
+     df = pd.DataFrame({
+         "Label": list(model.config.id2label.values()),
+         "Probability": probabilities
+     })
+     fig = px.bar(df, x="Label", y="Probability")
+     st.plotly_chart(fig, use_container_width=True)
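For context, the pytesseract.image_to_data call in ocr_page returns one row per item Tesseract detects, with pixel coordinates in left/top/width/height columns; app.py keeps the word-level rows and converts each to [x0, y0, x1, y1] corners before scaling into LayoutLMv3's 0-1000 box space. A minimal sketch of inspecting that frame (the sample.jpg path is a hypothetical placeholder, not part of the commit):

import pytesseract
from PIL import Image

image = Image.open("sample.jpg")  # hypothetical sample document image
df = pytesseract.image_to_data(image, output_type="data.frame", config="--psm 3")
# Word-level rows carry the text plus left/top/width/height in pixels and a
# confidence score; rows with NaN text are page/block/line records, which
# app.py drops via dropna().
print(df[["text", "left", "top", "width", "height", "conf"]].dropna().head())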
packages.txt ADDED
@@ -0,0 +1,2 @@
+ tesseract-ocr
+ tesseract-ocr-eng-best
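These are system packages the Space installs at build time: tesseract-ocr provides the tesseract binary that pytesseract shells out to, and tesseract-ocr-eng-best, going by its name, supplies the higher-accuracy English language data.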
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ pandas==2.2.2
+ huggingface-hub==0.23.0
+ Pillow==10.3.0
+ plotly-express==0.4.1
+ PyMuPDF==1.24.3
+ pytesseract==0.3.10
+ torch==2.2.2
+ transformers==4.40.2
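As a smoke test outside Streamlit, the same checkpoints can be exercised directly. The sketch below is not part of the commit and, unlike app.py, lets the processor run its own built-in Tesseract OCR instead of the manual pytesseract pass; sample.jpg is a hypothetical test image, and the Tesseract packages above must be installed:

import torch
from PIL import Image
from transformers import LayoutLMv3Processor, LayoutLMv3ForSequenceClassification

# Same checkpoints that app.py pins.
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3ForSequenceClassification.from_pretrained(
    "fsommers/layoutlmv3-autofinance-classification-us-v01"
).eval()

image = Image.open("sample.jpg").convert("RGB")  # hypothetical test image
# With the processor's default apply_ocr=True, words and boxes are extracted
# internally, so only the image needs to be passed.
encoding = processor(image, max_length=512, padding="max_length",
                     truncation=True, return_tensors="pt")
with torch.inference_mode():
    logits = model(**encoding).logits
print(model.config.id2label[logits.argmax(-1).item()])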