nehakothari committed on
Commit
308feef
·
verified ·
1 Parent(s): 3a60e02

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
3
+ from qwen_vl_utils import process_vision_info
4
+ import torch
5
+ import pandas as pd
6
+ import pytesseract
7
+ import cv2
8
+ import pymssql
9
+
10
app = Flask(__name__)

# Load the Qwen2-VL model and its processor once at import time so that
# every request reuses the same instances.
model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct-AWQ", torch_dtype="auto")
if torch.cuda.is_available():
    # Move the model to the GPU when CUDA is available.
    model.to("cuda")

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct-AWQ")
# The executable path is read from the inner ``pytesseract.pytesseract``
# module; assigning ``pytesseract.pytesseract_cmd`` on the package only
# created an unused attribute and left the default lookup in place.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
19
+
20
+ # Function to identify category based on keywords
21
def identify_category(text):
    """Classify invoice text as "Food", "Travel", "Stay" or "Others".

    The text is lower-cased and searched for each keyword as a plain
    substring. Groups are tried in the order Food, Travel, Stay, and the
    first group with any hit decides the result; "Others" is returned when
    nothing matches.
    """
    keyword_groups = (
        ("Food", ("food", "meal", "restaurant", "cafe", "coffee", "drink")),
        ("Travel", ("travel", "flight", "bus", "car", "taxi", "train", "ticket")),
        ("Stay", ("hotel", "stay", "room", "resort", "accommodation")),
    )
    lowered = text.lower()
    for label, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return "Others"
31
+
32
+ # Store DataFrame to Azure SQL Database
33
def store_to_azure_sql(dataframe):
    """Insert every row of *dataframe* into the ``Invoices`` table.

    The table is created if it does not exist yet. Each row receives a
    sequential, zero-padded ``EmployeeID``, a category computed from its
    "Invoice Details" text by ``identify_category``, and the approval
    status "Pending". All rows are committed together at the end.

    Connection settings can be overridden with the environment variables
    ``AZURE_SQL_SERVER``, ``AZURE_SQL_USER``, ``AZURE_SQL_PASSWORD`` and
    ``AZURE_SQL_DATABASE``; the previous hard-coded values remain the
    fallbacks so existing deployments keep working.

    Args:
        dataframe: pandas DataFrame with an "Invoice Details" column and,
            optionally, "Invoice Number", "Date", "Place" and "Amount".

    Returns:
        A status string. Failures are not raised; they are reported in the
        returned string.
    """
    import os

    def _clip(value, limit=255):
        # Keep strings inside the NVARCHAR(255) columns; anything that is
        # not a string (e.g. None, stored as NULL) passes through unchanged.
        return value[:limit] if isinstance(value, str) else value

    conn = None
    try:
        # NOTE(review): the fallback password is embedded in source control;
        # rotate it and supply AZURE_SQL_PASSWORD via the environment instead.
        conn = pymssql.connect(
            server=os.environ.get("AZURE_SQL_SERVER", "piosqlserverbd.database.windows.net"),
            user=os.environ.get("AZURE_SQL_USER", "pio-admin"),
            password=os.environ.get("AZURE_SQL_PASSWORD", "Poctest123#"),
            database=os.environ.get("AZURE_SQL_DATABASE", "PIOSqlDB"),
        )
        cursor = conn.cursor()

        create_table_query = """
        IF NOT EXISTS (SELECT * FROM sysobjects WHERE name='Invoices' AND xtype='U')
        CREATE TABLE Invoices (
        EmployeeID NVARCHAR(50) NOT NULL PRIMARY KEY,
        InvoiceNumber NVARCHAR(255),
        Date NVARCHAR(255),
        Place NVARCHAR(255),
        Amount NVARCHAR(255),
        Category NVARCHAR(255),
        ApprovalStatus NVARCHAR(50) DEFAULT 'Pending'
        )
        """
        cursor.execute(create_table_query)

        # EmployeeID is text, so ORDER BY ... DESC would compare it
        # lexicographically ("999" > "1000"). Take the numeric maximum.
        cursor.execute("SELECT MAX(TRY_CAST(EmployeeID AS INT)) FROM Invoices")
        last_row = cursor.fetchone()
        last_id = last_row[0] if last_row else None
        next_id = 0 if last_id is None else int(last_id) + 1

        insert_query = """
        INSERT INTO Invoices (EmployeeID, InvoiceNumber, Date, Place, Amount, Category, ApprovalStatus)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        """
        for _, row in dataframe.iterrows():
            category = identify_category(row["Invoice Details"])
            cursor.execute(
                insert_query,
                (
                    f"{next_id:03d}",
                    # The value may be None when parsing found no invoice
                    # number; slicing None would raise TypeError.
                    _clip(row.get("Invoice Number") or ""),
                    _clip(row.get("Date", "")),
                    _clip(row.get("Place", "")),
                    _clip(row.get("Amount", "")),
                    category,
                    "Pending",
                ),
            )
            next_id += 1

        conn.commit()
        return "Data successfully stored in Azure SQL Database."
    except Exception as e:
        return f"Error storing data to database: {e}"
    finally:
        # Release the connection on both the success and the failure path.
        if conn is not None:
            conn.close()
86
+
87
+ # Process image and extract details
88
def process_image(image_path):
    """Ask the vision-language model for invoice fields and parse the reply.

    Builds a single-turn chat message containing the image reference and an
    extraction instruction, runs generation with at most 128 new tokens,
    decodes the first result and returns ``parse_details`` applied to it.
    """
    instruction = (
        "Extract the following details from the invoice:\n"
        "- 'invoice_number'\n"
        "- 'date'\n"
        "- 'place'\n"
        "- 'amount' (monetary value in the relevant currency)\n"
        "- 'category' (based on the invoice type)"
    )
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": instruction},
            ],
        }
    ]

    prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    images, videos = process_vision_info(conversation)
    batch = processor(text=[prompt], images=images, videos=videos, padding=True, return_tensors="pt")
    batch = batch.to(model.device)

    generated = model.generate(**batch, max_new_tokens=128)

    # Drop the leading len(prompt) ids of every output sequence so that only
    # the tokens after the prompt are decoded.
    completions = []
    for prompt_ids, full_ids in zip(batch.input_ids, generated):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(completions, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return parse_details(decoded[0])
111
+
112
def parse_details(details):
    """Pull invoice fields out of the model's line-oriented text answer.

    Only lines of the form ``label: value`` are considered. The label (the
    text before the first colon) selects the field, and everything after
    that first colon is the value, so values that themselves contain a
    colon (times, for example) are kept whole. Surrounding whitespace,
    quotes, backticks, asterisks and commas are trimmed from the value, and
    lines whose value is empty (such as a header sentence ending in a
    colon) are ignored. When several lines map to the same field the last
    one wins.

    Args:
        details: Raw text returned by the model.

    Returns:
        dict with keys "Invoice Number", "Date", "Place", "Amount" (each a
        string or None when not found) and "Invoice Details" (the raw
        input text).
    """
    parsed_data = {
        "Invoice Number": None,
        "Date": None,
        "Place": None,
        "Amount": None,
        "Invoice Details": details,
    }
    amount_keywords = ("total", "amount", "cost")

    for line in details.split("\n"):
        label, separator, value = line.partition(":")
        if not separator:
            # Not a "label: value" line; nothing reliable to extract.
            continue

        value = value.strip().strip("*'\"`,").strip()
        if not value:
            continue

        label = label.lower()
        # Specific labels are tested before the generic "invoice" so that
        # "Invoice Date" or "Invoice Amount" do not land in Invoice Number.
        if "date" in label:
            field = "Date"
        elif "place" in label:
            field = "Place"
        elif any(keyword in label for keyword in amount_keywords):
            field = "Amount"
        elif "invoice" in label:
            field = "Invoice Number"
        else:
            continue

        parsed_data[field] = value

    return parsed_data
132
+
133
@app.route('/extract', methods=['POST'])
def extract_invoice():
    """Handle POST /extract: run extraction on an image and store the result.

    Expects a JSON object with an ``image_path`` entry. Responds with the
    extracted fields under "data" and the database status string under
    "status". A missing or non-JSON body, or a missing/empty
    ``image_path``, yields HTTP 400 with an "error" message instead of an
    unhandled exception.
    """
    payload = request.get_json(silent=True)
    image_path = payload.get('image_path') if isinstance(payload, dict) else None
    if not image_path:
        return jsonify({"error": "Request body must be JSON with a non-empty 'image_path'."}), 400

    # NOTE(review): image_path comes straight from the client and is passed
    # to the image loader unchanged; consider restricting it to an allowed
    # directory or scheme.
    extracted_data = process_image(image_path)
    df = pd.DataFrame([extracted_data])
    status = store_to_azure_sql(df)
    return jsonify({"data": extracted_data, "status": status})
140
+
141
if __name__ == '__main__':
    import os

    # The default stays 22 for backward compatibility. NOTE(review): 22 is a
    # privileged port conventionally used by SSH; set the PORT environment
    # variable to run on a different one.
    app.run(port=int(os.environ.get("PORT", "22")))