|
import re
import unicodedata

import gradio as gr
from transformers import pipeline, AutoTokenizer

from optimum.onnxruntime import ORTModelForTokenClassification
|
|
|
|
|
# Keyword taxonomy for rule-based expenditure classification.
# Top-level keys are the budget "main categories" (Need / Want /
# Saving/Investment); each maps a sub-category name to a list of
# lowercase, accent-stripped Vietnamese keywords that are matched as
# substrings of the normalized user input (see classify_and_extract).
# NOTE(review): "bao hiem" (insurance) appears under both "Medical" and
# "Insurance"; dict iteration order means "Medical" wins for that
# keyword — confirm this precedence is intended.
CATEGORIES = {
    "Need": {
        "Utilities": ["dien", "nuoc", "gas", "internet", "dienthoai"],
        "Housing": ["nha", "thue", "sua chua", "sua nha"],
        "Groceries": ["thuc pham", "sieu thi", "rau cu", "do an"],
        "Transportation": ["xang", "xe", "ve xe", "bao duong"],
        "Education": ["hoc phi", "sach", "truong", "khoa hoc"],
        "Medical": ["bao hiem", "bac si", "thuoc"],
        "Insurance": ["bao hiem", "nha", "oto", "suc khoe"],
        "Childcare": ["tre em", "truong mam non", "nguoi giup viec"],
    },
    "Want": {
        "Dining Out": ["nha hang", "quan an", "cafe", "tra sua"],
        "Entertainment": ["phim", "karaoke", "game", "nhac", "do choi", "bup be"],
        "Travel": ["du lich", "ve may bay", "khach san"],
        "Fitness": ["gym", "yoga", "the thao"],
        "Shopping": ["quan ao", "phu kien", "dien thoai", "luxury"],
        "Hobbies": ["so thich", "do choi", "my thuat"],
        "Personal Care": ["spa", "toc", "lam dep", "my pham"],
    },
    "Saving/Investment": {
        "Emergency Fund": ["quy du phong"],
        "Retirement": ["nghi huu"],
        "Investments": ["chung khoan", "bat dong san"],
        "Debt Repayment": ["tra no"],
        "Education Fund": ["quy hoc tap"],
        "Savings for Goals": ["quy tiet kiem"],
        "Health Savings": ["bao hiem y te"],
    }
}
|
|
|
|
|
def normalize_vietnamese(text):
    """Map Vietnamese accented letters to their unaccented ASCII base.

    Decomposes *text* to NFD so each diacritic becomes a separate
    combining codepoint, drops all combining marks (Unicode category
    "Mn"), then replaces "đ"/"Đ" — which are distinct base letters, not
    mark + d, and therefore survive NFD — with "d"/"D".

    Example: "điện nước" -> "dien nuoc".

    Bug fixed: the previous implementation substituted every accented
    character with the empty string (re.sub(..., '', text)), deleting
    the letters outright ("điện" became "in"), so normalized input could
    never match the unaccented CATEGORIES keywords such as "dien". The
    trailing .replace("đ", "d") was also dead code because "đ" was
    inside the deleted character class.
    """
    decomposed = unicodedata.normalize("NFD", text)
    no_marks = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
    return no_marks.replace("đ", "d").replace("Đ", "D")
|
|
|
|
|
# Multilingual DistilBERT checkpoint used for both tokenization and NER.
model_name = "distilbert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Export the Hugging Face checkpoint to ONNX Runtime on load.
# NOTE(review): despite the name, no quantization is applied here — this
# is only an ONNX export; rename or actually quantize. Also,
# `from_transformers=True` is a legacy optimum flag (newer releases use
# `export=True`) — verify against the pinned optimum version.
# NOTE(review): this base checkpoint has no fine-tuned token-
# classification head, so NER predictions are likely untrained noise —
# confirm a fine-tuned NER model was intended.
quantized_model = ORTModelForTokenClassification.from_pretrained(model_name, from_transformers=True)

# Token-level NER pipeline; "simple" aggregation merges word-piece
# tokens into whole-word entity spans.
ner_model = pipeline("ner", model=quantized_model, tokenizer=tokenizer, aggregation_strategy="simple")
|
|
|
|
|
def classify_and_extract(user_input):
    """Classify an expenditure description and extract a numeric amount.

    The input is lowercased and accent-stripped, then scanned against the
    CATEGORIES keyword lists; the first (main, sub) pair whose keyword
    occurs as a substring wins. If no keyword matches, the original text
    is run through the NER pipeline instead and returned uncategorized.

    Returns a dict with keys "Main Category", "Sub Category", "Amount"
    (first integer/decimal found, or "Unknown"), and "Entities" (empty
    for keyword matches, NER output otherwise).
    """
    normalized = normalize_vietnamese(user_input.lower())

    # First run of digits (optionally with up to two decimals) is taken
    # as the spend amount.
    found = re.search(r"(\d+(\.\d{1,2})?)", normalized)
    amount = found.group(0) if found else "Unknown"

    # Lazily scan the taxonomy; stop at the first sub-category whose
    # keyword list hits the normalized text.
    hit = next(
        (
            (main, sub)
            for main, subs in CATEGORIES.items()
            for sub, keywords in subs.items()
            if any(kw in normalized for kw in keywords)
        ),
        None,
    )
    if hit is not None:
        main, sub = hit
        return {
            "Main Category": main,
            "Sub Category": sub,
            "Amount": amount,
            "Entities": []
        }

    # No keyword matched — fall back to NER on the raw (unnormalized) text.
    return {
        "Main Category": "Uncategorized",
        "Sub Category": "Unknown",
        "Amount": amount,
        "Entities": ner_model(user_input),
    }
|
|
|
|
|
def process_user_input(user_input):
    """Run classification on *user_input* and format the result as the
    multi-line display string shown in the Gradio UI."""
    result = classify_and_extract(user_input)
    lines = [
        f"Main Category: {result['Main Category']}",
        f"Sub Category: {result['Sub Category']}",
        f"Amount: {result['Amount']}",
        f"Entities: {result['Entities']}",
    ]
    return "\n".join(lines)
|
|
|
# Wire the classifier into a single-textbox-in / single-textbox-out
# Gradio UI.
iface = gr.Interface(
    fn=process_user_input,
    inputs="text",
    outputs="text",
    title="Expenditure Classifier",
    description="Classify expenditures into main and subcategories (Need, Want, Saving/Investment) and extract amounts."
)

# Start the local Gradio server (blocks the process until stopped).
iface.launch()
|
|