Spend / app.py
Meomap's picture
Update app.py
0bfabb1 verified
raw
history blame
3.84 kB
import re
from transformers import pipeline, AutoTokenizer
from optimum.onnxruntime import ORTModelForTokenClassification
import gradio as gr
# Keyword table for the rule-based classifier.
#
# Layout: main category -> sub category -> list of trigger keywords.
# Every keyword is written in lowercase, accent-free Vietnamese (or English)
# so it can be compared against input that has been lowercased and passed
# through normalize_vietnamese().  The iteration order of this mapping is used
# by classify_and_extract() when several keywords match, so keep the ordering
# stable when adding entries.
CATEGORIES = {
    "Need": {
        "Utilities": ["dien", "nuoc", "gas", "internet", "dienthoai"],
        "Housing": ["nha", "thue", "sua chua", "sua nha"],
        "Groceries": ["thuc pham", "sieu thi", "rau cu", "do an"],
        "Transportation": ["xang", "xe", "ve xe", "bao duong"],
        "Education": ["hoc phi", "sach", "truong", "khoa hoc"],
        "Medical": ["bao hiem", "bac si", "thuoc"],
        "Insurance": ["bao hiem", "nha", "oto", "suc khoe"],
        "Childcare": ["tre em", "truong mam non", "nguoi giup viec"],
    },
    "Want": {
        "Dining Out": ["nha hang", "quan an", "cafe", "tra sua"],
        "Entertainment": ["phim", "karaoke", "game", "nhac", "do choi", "bup be"],
        "Travel": ["du lich", "ve may bay", "khach san"],
        "Fitness": ["gym", "yoga", "the thao"],
        "Shopping": ["quan ao", "phu kien", "dien thoai", "luxury"],
        "Hobbies": ["so thich", "do choi", "my thuat"],
        "Personal Care": ["spa", "toc", "lam dep", "my pham"],
    },
    "Saving/Investment": {
        "Emergency Fund": ["quy du phong"],
        "Retirement": ["nghi huu"],
        "Investments": ["chung khoan", "bat dong san"],
        "Debt Repayment": ["tra no"],
        "Education Fund": ["quy hoc tap"],
        "Savings for Goals": ["quy tiet kiem"],
        "Health Savings": ["bao hiem y te"],
    },
}
# Normalize Vietnamese input (strip diacritics but keep the base letters)
def normalize_vietnamese(text):
    """Return *text* with Vietnamese diacritics removed.

    Accented letters are mapped to their unaccented base letter (for example
    "điện" -> "dien") so the result can be compared against the accent-free
    keywords in the category table.  The previous implementation deleted the
    accented letters outright, which turned "điện" into "in" and made the
    keyword lookup miss accented input.

    Args:
        text: The string to normalize.  Case is preserved; callers that need
            case-insensitive matching should lowercase separately.

    Returns:
        The input with combining marks removed and "đ"/"Đ" replaced by
        "d"/"D".  Characters without diacritics are returned unchanged.
    """
    # Function-scope import so this block does not depend on the file header.
    import unicodedata

    # NFD splits each accented letter into base letter + combining marks
    # (category "Mn"); dropping those marks leaves only the base letters.
    decomposed = unicodedata.normalize("NFD", text)
    without_marks = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    # Recompose so text in other scripts that NFD split apart is restored.
    recomposed = unicodedata.normalize("NFC", without_marks)
    # "đ"/"Đ" are standalone letters with no decomposition; map them by hand.
    return recomposed.replace("đ", "d").replace("Đ", "D")
# Load the tokenizer and an ONNX Runtime token-classification model for the
# checkpoint named below.  This runs once, at import time.
# NOTE(review): despite the `quantized_model` name, no quantization step is
# visible in this file.  `from_transformers=True` presumably asks Optimum to
# convert the Transformers checkpoint to ONNX while loading; newer Optimum
# releases appear to spell this option `export=True` — confirm against the
# installed version.
# NOTE(review): the checkpoint name looks like a base model rather than one
# fine-tuned for NER — verify that the entities it produces are meaningful.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
quantized_model = ORTModelForTokenClassification.from_pretrained(model_name, from_transformers=True)
# Build the "ner" pipeline around the model and tokenizer loaded above.
# aggregation_strategy="simple" is passed through to the pipeline; it
# presumably merges sub-word tokens into whole entities — TODO confirm.
ner_model = pipeline("ner", model=quantized_model, tokenizer=tokenizer, aggregation_strategy="simple")
# Classify input
def _match_category(normalized_input):
    """Return (main category, sub category) for the best keyword hit, or None.

    A keyword only counts when it appears as a whole word or phrase, so "xe"
    does not fire inside "xem".  When several keywords match, the longest one
    wins, so "nha hang" beats "nha" and "dien thoai" beats "dien".  Keywords
    of equal length keep the iteration order of CATEGORIES.
    """
    best_pair = None
    best_length = 0
    for main_category, subcategories in CATEGORIES.items():
        for subcategory, keywords in subcategories.items():
            for keyword in keywords:
                # Only a strictly longer keyword can replace the current best,
                # which preserves first-come ordering on ties.
                if len(keyword) <= best_length:
                    continue
                pattern = rf"(?<!\w){re.escape(keyword)}(?!\w)"
                if re.search(pattern, normalized_input):
                    best_pair = (main_category, subcategory)
                    best_length = len(keyword)
    return best_pair


def classify_and_extract(user_input):
    """Classify a free-text expense description and pull out its amount.

    The text is lowercased and passed through normalize_vietnamese(), then
    matched against the CATEGORIES keyword table.  If no keyword matches, the
    original text is handed to the NER pipeline instead.

    Args:
        user_input: Expense description typed by the user.  None is treated
            as an empty string.

    Returns:
        A dict with the keys "Main Category", "Sub Category", "Amount" and
        "Entities".  "Amount" is the first number found in the text as a
        string (digit groups separated by "." or "," are kept together, e.g.
        "50.000"), or "Unknown" when there is none.  "Entities" is the NER
        pipeline output when no keyword matched, otherwise an empty list.
    """
    user_input = user_input or ""
    normalized_input = normalize_vietnamese(user_input.lower())

    # Accept thousands/decimal separators so "1.500.000" is not cut to "1.50".
    amount_match = re.search(r"\d+(?:[.,]\d+)*", normalized_input)
    amount = amount_match.group(0) if amount_match else "Unknown"

    # Rule-based matching for categories
    matched = _match_category(normalized_input)
    if matched is not None:
        main_category, subcategory = matched
        return {
            "Main Category": main_category,
            "Sub Category": subcategory,
            "Amount": amount,
            "Entities": [],  # Skip NER if matched via rules
        }

    # Fallback to NER model for unmatched cases; blank input has nothing to
    # tag, so the model is not invoked for it.
    ner_results = ner_model(user_input) if user_input.strip() else []
    return {
        "Main Category": "Uncategorized",
        "Sub Category": "Unknown",
        "Amount": amount,
        "Entities": ner_results,
    }
# Gradio interface
def process_user_input(user_input):
    """Format the classification of *user_input* as a four-line text report.

    Delegates to classify_and_extract() and renders its "Main Category",
    "Sub Category", "Amount" and "Entities" values, one per line, with no
    trailing newline.
    """
    outcome = classify_and_extract(user_input)
    main_line = f"Main Category: {outcome['Main Category']}\n"
    sub_line = f"Sub Category: {outcome['Sub Category']}\n"
    amount_line = f"Amount: {outcome['Amount']}\n"
    entities_line = f"Entities: {outcome['Entities']}"
    return "".join((main_line, sub_line, amount_line, entities_line))
# Build the web UI: a single text input wired to process_user_input, with the
# formatted report shown in a single text output.
iface = gr.Interface(
    fn=process_user_input,
    inputs="text",
    outputs="text",
    title="Expenditure Classifier",
    description="Classify expenditures into main and subcategories (Need, Want, Saving/Investment) and extract amounts."
)
# Start the app.  This call sits at module top level, so it runs whenever the
# file is executed or imported.
iface.launch()