Spaces:

Balaprime
/

InvoiceBot

Runtime error

App Files Files Community

Balaprime committed on Jun 6

Commit

ce5499f

verified ·

1 Parent(s): b4f1e0d

Create utils.py

Browse files

Files changed (1) hide show

utils.py +107 -0

utils.py ADDED Viewed

	@@ -0,0 +1,107 @@

+import ast
+import re
+
+import pandas as pd
+import streamlit as st
+import torch
+from pypdf import PdfReader
+from transformers import AutoTokenizer, AutoModelForCausalLM
+# Load the Mistral instruct model and its tokenizer from Hugging Face.
+# NOTE: this runs at import time, so importing this module loads the full
+# 7B checkpoint before any function below can be called.
+# A single constant keeps the tokenizer and the model pointing at the same
+# checkpoint.
+MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1"
+# Half precision only when a CUDA device is available; float32 otherwise.
+_MODEL_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype=_MODEL_DTYPE,
+)
+# Inference only: put the model in evaluation mode.
+model.eval()
+# Read PDF text
+def get_pdf_text(pdf_doc):
+    """Return the text of every page of ``pdf_doc`` joined in page order.
+
+    ``pdf_doc`` is handed straight to ``PdfReader``. The per-page texts are
+    concatenated with no separator between pages.
+    """
+    pdf = PdfReader(pdf_doc)
+    return "".join(pg.extract_text() for pg in pdf.pages)
+# Extract invoice data using the model
+def extracted_data(pages_data):
+    """Ask the language model to extract invoice fields from ``pages_data``.
+
+    Builds an instruction prompt around the invoice text, runs generation
+    with the module-level ``model``/``tokenizer`` and returns the decoded
+    text of the newly generated tokens only.
+
+    Args:
+        pages_data: Raw text of one invoice.
+
+    Returns:
+        The model's continuation as a string, without the echoed prompt.
+    """
+    prompt = f"""Extract the following values from the text:
+invoice no., Description, Quantity, date, Unit price, Amount, Total, email, phone number, and address.
+Text: {pages_data}
+Output format:
+{{
+    'Invoice no.': '1001329',
+    'Description': 'Office Chair',
+    'Quantity': '2',
+    'Date': '5/4/2023',
+    'Unit price': '1100.00',
+    'Amount': '2200.00',
+    'Total': '2200.00',
+    'Email': '[email protected]',
+    'Phone number': '9999999999',
+    'Address': 'Mumbai, India'
+}}
+"""
+    # NOTE(review): truncation presumably drops tokens from the end of the
+    # prompt, which is where the output-format example sits; long invoices
+    # may lose it. Confirm the tokenizer's truncation side.
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
+    prompt_len = inputs["input_ids"].shape[1]
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=512)
+    # A causal LM returns prompt + continuation. Drop the prompt tokens so the
+    # example dict inside the prompt is not mistaken for the model's answer.
+    new_tokens = outputs[0][prompt_len:]
+    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
+    return response
+# Process PDF list and build DataFrame
+def create_docs(user_pdf_list):
+    """Extract invoice fields from each PDF and collect them in a DataFrame.
+
+    For every file the text is read with ``get_pdf_text`` and sent to
+    ``extracted_data``. The last flat ``{...}`` group in the model output is
+    parsed as a Python dict literal. Files whose output cannot be parsed are
+    reported with ``print`` and skipped.
+
+    Args:
+        user_pdf_list: Iterable of PDF files accepted by ``get_pdf_text``.
+
+    Returns:
+        A DataFrame with one row per successfully parsed invoice. The ten
+        expected columns come first; any extra keys the model returned follow.
+    """
+    columns = [
+        'Invoice no.', 'Description', 'Quantity', 'Date',
+        'Unit price', 'Amount', 'Total', 'Email',
+        'Phone number', 'Address'
+    ]
+    rows = []
+    for file in user_pdf_list:
+        raw_text = get_pdf_text(file)
+        llm_output = extracted_data(raw_text)
+        # Match flat (non-nested) brace groups and use the last one. A greedy
+        # match would span every group in the output and yield unparsable text.
+        brace_groups = re.findall(r'\{[^{}]*\}', llm_output)
+        if not brace_groups:
+            print("Model response format issue.")
+            continue
+        try:
+            # SECURITY: the model output derives from untrusted PDF content,
+            # so it is parsed as a literal only and never passed to eval().
+            data_dict = ast.literal_eval(brace_groups[-1])
+        except (ValueError, TypeError, SyntaxError) as e:
+            print("Parsing error:", e)
+            continue
+        if isinstance(data_dict, dict) and data_dict:
+            rows.append(data_dict)
+        else:
+            print("Model response format issue.")
+    if not rows:
+        return pd.DataFrame(columns=columns)
+    # DataFrame.append was removed in pandas 2.0, so build the frame in one go.
+    df = pd.DataFrame(rows)
+    extra = [c for c in df.columns if c not in columns]
+    return df.reindex(columns=columns + extra)
+def main():
+    """Render the Streamlit page for uploading invoices and exporting a CSV.
+
+    Shows a multi-file PDF uploader and an "Extract Data" button. On click
+    the uploaded files go to ``create_docs``; the resulting table is
+    displayed and, when it has rows, offered as a CSV download. Warnings are
+    shown when no file was uploaded or nothing could be extracted.
+    """
+    st.set_page_config(page_title="Invoice Extraction Bot")
+    st.title("Invoice Extraction Bot 🤖")
+    st.subheader("Upload your PDF invoices to extract key information!")
+    pdf_files = st.file_uploader("Upload PDF invoices", type=["pdf"], accept_multiple_files=True)
+    submit = st.button("Extract Data")
+    if not submit:
+        return
+    if not pdf_files:
+        # Give feedback instead of silently ignoring the click.
+        st.warning("Please upload at least one PDF invoice first.")
+        return
+    # Keep the spinner around the slow extraction step only.
+    with st.spinner("Extracting data from invoices..."):
+        df = create_docs(pdf_files)
+    st.write(df)
+    if df.empty:
+        # create_docs skips unparsable files, so an empty table means failure.
+        st.warning("No invoice data could be extracted from the uploaded files.")
+        return
+    csv_data = df.to_csv(index=False).encode("utf-8")
+    st.download_button(
+        "Download CSV",
+        csv_data,
+        "invoice_data.csv",
+        "text/csv",
+        key="download-csv"
+    )
+    st.success("Data extraction completed! 🎉")
+if __name__ == "__main__":
+    main()