Balaprime committed on
Commit
ce5499f
·
verified ·
1 Parent(s): b4f1e0d

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +107 -0
utils.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM
3
+ from pypdf import PdfReader
4
+ import pandas as pd
5
+ import re
6
+ import torch
7
+
8
# Load the Mistral instruct checkpoint and its tokenizer from Hugging Face.
# The checkpoint name is spelled once so tokenizer and model cannot drift apart.
_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)

# Half precision only when CUDA is available; full precision otherwise.
_MODEL_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    _MODEL_ID,
    device_map="auto",
    torch_dtype=_MODEL_DTYPE,
)
model.eval()  # put the module in evaluation mode
16
+
17
+ # Read PDF text
18
def get_pdf_text(pdf_doc):
    """Return the text of every page in *pdf_doc*, concatenated in page order.

    Args:
        pdf_doc: The PDF source, passed straight to ``PdfReader``.

    Returns:
        One string holding each page's extracted text back to back, with no
        separator inserted between pages.
    """
    pages = PdfReader(pdf_doc).pages
    return "".join(page.extract_text() for page in pages)
24
+
25
+ # Extract invoice data using the model
26
def extracted_data(pages_data):
    """Ask the language model to extract invoice fields from raw text.

    Embeds *pages_data* in an instruction prompt, runs generation with the
    module-level ``tokenizer`` and ``model``, and returns the model's answer.

    Args:
        pages_data: Invoice text to place in the prompt.

    Returns:
        The decoded text of the newly generated tokens only. The prompt is
        deliberately not included: it contains an example ``{...}`` block,
        and echoing it back would make that example indistinguishable from
        the model's real answer for any caller that searches for braces.
    """
    prompt = f"""Extract the following values from the text:
invoice no., Description, Quantity, date, Unit price, Amount, Total, email, phone number, and address.

Text: {pages_data}

Output format:
{{
'Invoice no.': '1001329',
'Description': 'Office Chair',
'Quantity': '2',
'Date': '5/4/2023',
'Unit price': '1100.00',
'Amount': '2200.00',
'Total': '2200.00',
'Email': '[email protected]',
'Phone number': '9999999999',
'Address': 'Mumbai, India'
}}
"""

    # NOTE(review): the prompt is capped at 2048 tokens. If the tokenizer
    # truncates on the right (the usual default -- confirm), very long invoice
    # text can push the "Output format" section out of the prompt.
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    prompt_length = encoded["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(**encoded, max_new_tokens=512)

    # For a causal LM, generate() returns the prompt tokens followed by the
    # continuation; keep only the continuation.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)
54
+
55
+ # Process PDF list and build DataFrame
56
def create_docs(user_pdf_list):
    """Extract invoice fields from each PDF and collect them in a DataFrame.

    For every item in *user_pdf_list* the text is read with ``get_pdf_text``,
    sent to ``extracted_data``, and the last flat ``{...}`` group in the reply
    is parsed as a Python dict literal. Files whose reply contains no such
    group, or whose group does not parse to a non-empty dict, are skipped
    with a printed message.

    Args:
        user_pdf_list: Iterable of PDF sources accepted by ``get_pdf_text``.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully parsed invoice.
        The ten expected invoice columns come first; any additional keys the
        model returned are kept as extra columns after them.
    """
    # Local import keeps this block self-contained; the module's import list
    # does not include ast.
    import ast

    columns = [
        'Invoice no.', 'Description', 'Quantity', 'Date',
        'Unit price', 'Amount', 'Total', 'Email',
        'Phone number', 'Address',
    ]
    rows = []

    # Non-greedy by construction: a group may not contain braces, so each
    # match is one flat dict literal rather than "first { to last }".
    dict_pattern = re.compile(r"\{[^{}]*\}")

    for pdf_file in user_pdf_list:
        raw_text = get_pdf_text(pdf_file)
        llm_output = extracted_data(raw_text)

        candidates = dict_pattern.findall(llm_output)
        if not candidates:
            print("Model response format issue.")
            continue

        # The last group is the model's answer even if the reply echoes the
        # prompt, whose example dict would come first.
        try:
            # SECURITY: the reply is derived from uploaded documents, so it is
            # parsed as a literal with ast.literal_eval instead of eval().
            data_dict = ast.literal_eval(candidates[-1])
        except (ValueError, SyntaxError, TypeError) as e:
            print("Parsing error:", e)
            continue

        if not isinstance(data_dict, dict) or not data_dict:
            print("Model response format issue.")
            continue

        rows.append(data_dict)
        # Keep unexpected keys as extra columns, as row-appending used to do.
        for key in data_dict:
            if key not in columns:
                columns.append(key)

    # Build the frame once; DataFrame.append no longer exists in pandas >= 2.0.
    return pd.DataFrame(rows, columns=columns)
81
+
82
def main():
    """Render the Streamlit page for uploading invoices and exporting results.

    Shows a multi-file PDF uploader and an "Extract Data" button. When the
    button is pressed with files present, the files are passed to
    ``create_docs``, the resulting table is displayed, and, if it has rows,
    a CSV download button and a success message are shown. Pressing the
    button with nothing uploaded, or getting an empty table back, produces a
    warning instead of silently doing nothing.
    """
    st.set_page_config(page_title="Invoice Extraction Bot")
    st.title("Invoice Extraction Bot 🤖")
    st.subheader("Upload your PDF invoices to extract key information!")

    pdf_files = st.file_uploader("Upload PDF invoices", type=["pdf"], accept_multiple_files=True)
    submit = st.button("Extract Data")

    if not submit:
        return

    if not pdf_files:
        st.warning("Please upload at least one PDF invoice first.")
        return

    with st.spinner("Extracting data from invoices..."):
        df = create_docs(pdf_files)

    st.write(df)

    if df.empty:
        st.warning("No invoice data could be extracted from the uploaded files.")
        return

    csv_data = df.to_csv(index=False).encode("utf-8")
    st.download_button(
        "Download CSV",
        csv_data,
        "invoice_data.csv",
        "text/csv",
        key="download-csv",
    )
    st.success("Data extraction completed! 🎉")


if __name__ == "__main__":
    main()