|
import re
import unicodedata

import gradio as gr
from transformers import pipeline, AutoTokenizer

from optimum.onnxruntime import ORTModelForTokenClassification
|
|
|
|
|
# Keyword taxonomy for rule-based expenditure classification.
# Top-level keys are the budget "main categories" (Need / Want /
# Saving/Investment); each maps a sub-category name to a list of
# lowercase, accent-stripped Vietnamese keywords that are matched as
# substrings of the normalized user input (see classify_and_extract).
# NOTE(review): "bao hiem" (insurance) appears under both "Medical" and
# "Insurance"; dict iteration order means "Medical" wins for that
# keyword — confirm this precedence is intended.
CATEGORIES = {
    "Need": {
        "Utilities": ["dien", "nuoc", "gas", "internet", "dienthoai"],
        "Housing": ["nha", "thue", "sua chua", "sua nha"],
        "Groceries": ["thuc pham", "sieu thi", "rau cu", "do an"],
        "Transportation": ["xang", "xe", "ve xe", "bao duong"],
        "Education": ["hoc phi", "sach", "truong", "khoa hoc"],
        "Medical": ["bao hiem", "bac si", "thuoc"],
        "Insurance": ["bao hiem", "nha", "oto", "suc khoe"],
        "Childcare": ["tre em", "truong mam non", "nguoi giup viec"],
    },
    "Want": {
        "Dining Out": ["nha hang", "quan an", "cafe", "tra sua"],
        "Entertainment": ["phim", "karaoke", "game", "nhac", "do choi", "bup be"],
        "Travel": ["du lich", "ve may bay", "khach san"],
        "Fitness": ["gym", "yoga", "the thao"],
        "Shopping": ["quan ao", "phu kien", "dien thoai", "luxury"],
        "Hobbies": ["so thich", "do choi", "my thuat"],
        "Personal Care": ["spa", "toc", "lam dep", "my pham"],
    },
    "Saving/Investment": {
        "Emergency Fund": ["quy du phong"],
        "Retirement": ["nghi huu"],
        "Investments": ["chung khoan", "bat dong san"],
        "Debt Repayment": ["tra no"],
        "Education Fund": ["quy hoc tap"],
        "Savings for Goals": ["quy tiet kiem"],
        "Health Savings": ["bao hiem y te"],
    }
}
|
|
|
|
|
def normalize_vietnamese(text):
    """Map Vietnamese accented letters to their unaccented ASCII base.

    Decomposes *text* to NFD so each diacritic becomes a separate
    combining codepoint, drops all combining marks (Unicode category
    "Mn"), then replaces "đ"/"Đ" — which are distinct base letters, not
    mark + d, and therefore survive NFD — with "d"/"D".

    Example: "điện nước" -> "dien nuoc".

    Bug fixed: the previous implementation substituted every accented
    character with the empty string (re.sub(..., '', text)), deleting
    the letters outright ("điện" became "in"), so normalized input could
    never match the unaccented CATEGORIES keywords such as "dien". The
    trailing .replace("đ", "d") was also dead code because "đ" was
    inside the deleted character class.
    """
    decomposed = unicodedata.normalize("NFD", text)
    no_marks = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
    return no_marks.replace("đ", "d").replace("Đ", "D")
|
|
|
|
|
# Multilingual DistilBERT checkpoint used for both tokenization and NER.
model_name = "distilbert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Export the Hugging Face checkpoint to ONNX Runtime on load.
# NOTE(review): despite the name, no quantization is applied here — this
# is only an ONNX export; rename or actually quantize. Also,
# `from_transformers=True` is a legacy optimum flag (newer releases use
# `export=True`) — verify against the pinned optimum version.
# NOTE(review): this base checkpoint has no fine-tuned token-
# classification head, so NER predictions are likely untrained noise —
# confirm a fine-tuned NER model was intended.
quantized_model = ORTModelForTokenClassification.from_pretrained(model_name, from_transformers=True)

# Token-level NER pipeline; "simple" aggregation merges word-piece
# tokens into whole-word entity spans.
ner_model = pipeline("ner", model=quantized_model, tokenizer=tokenizer, aggregation_strategy="simple")
|
|
|
|
|
def classify_and_extract(user_input):
    """Classify an expenditure description and extract a numeric amount.

    The input is lowercased and accent-stripped, then scanned against the
    CATEGORIES keyword lists; the first (main, sub) pair whose keyword
    occurs as a substring wins. If no keyword matches, the original text
    is run through the NER pipeline instead and returned uncategorized.

    Returns a dict with keys "Main Category", "Sub Category", "Amount"
    (first integer/decimal found, or "Unknown"), and "Entities" (empty
    for keyword matches, NER output otherwise).
    """
    normalized = normalize_vietnamese(user_input.lower())

    # First run of digits (optionally with up to two decimals) is taken
    # as the spend amount.
    found = re.search(r"(\d+(\.\d{1,2})?)", normalized)
    amount = found.group(0) if found else "Unknown"

    # Lazily scan the taxonomy; stop at the first sub-category whose
    # keyword list hits the normalized text.
    hit = next(
        (
            (main, sub)
            for main, subs in CATEGORIES.items()
            for sub, keywords in subs.items()
            if any(kw in normalized for kw in keywords)
        ),
        None,
    )
    if hit is not None:
        main, sub = hit
        return {
            "Main Category": main,
            "Sub Category": sub,
            "Amount": amount,
            "Entities": []
        }

    # No keyword matched — fall back to NER on the raw (unnormalized) text.
    return {
        "Main Category": "Uncategorized",
        "Sub Category": "Unknown",
        "Amount": amount,
        "Entities": ner_model(user_input),
    }
|
|
|
|
|
def process_user_input(user_input):
    """Run classification on *user_input* and format the result as the
    multi-line display string shown in the Gradio UI."""
    result = classify_and_extract(user_input)
    lines = [
        f"Main Category: {result['Main Category']}",
        f"Sub Category: {result['Sub Category']}",
        f"Amount: {result['Amount']}",
        f"Entities: {result['Entities']}",
    ]
    return "\n".join(lines)
|
|
|
# Wire the classifier into a single-textbox-in / single-textbox-out
# Gradio UI.
iface = gr.Interface(
    fn=process_user_input,
    inputs="text",
    outputs="text",
    title="Expenditure Classifier",
    description="Classify expenditures into main and subcategories (Need, Want, Saving/Investment) and extract amounts."
)

# Start the local Gradio server (blocks the process until stopped).
iface.launch()
|
|