# (removed extraction artifacts: file-size banner, git-blame hashes, column ruler — not Python code)
import re
import unicodedata

import gradio as gr
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import AutoTokenizer, pipeline
# Keyword lexicon for rule-based expense classification.
# Top level: budget bucket (Need / Want / Saving-Investment);
# second level: sub-category -> list of Vietnamese keywords.
# Keywords are spelled WITHOUT diacritics so they can be matched against
# the output of normalize_vietnamese() applied to lowercased user input.
CATEGORIES = {
    "Need": {
        "Housing": ["nha", "thue", "sua nha"],
        "Groceries": ["thuc pham", "rau cu", "sieu thi"],
    },
    "Want": {
        "Entertainment": ["phim", "karaoke", "game", "do choi"],
        "Dining Out": ["cafe", "nha hang", "tra sua"],
    },
    "Saving/Investment": {
        "Savings": ["quy tiet kiem", "dau tu", "tai san"],
    },
}
# Normalize Vietnamese text to plain ASCII for keyword matching.
def normalize_vietnamese(text):
    """Return *text* with Vietnamese diacritics reduced to base ASCII letters.

    Bug fix: the previous regex *deleted* every accented character outright
    (e.g. "nhà" -> "nh"), so keyword matching against CATEGORIES silently
    failed; it also removed "đ" before the chained .replace("đ", "d") could
    ever run. NFD decomposition splits each accented letter into its base
    letter plus combining marks, which are then dropped. "đ"/"Đ" do not
    decompose under NFD, so they are mapped explicitly.
    """
    decomposed = unicodedata.normalize("NFD", text)
    # Drop combining marks (Unicode category "Mn"), keep base letters.
    stripped = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
    return stripped.replace("đ", "d").replace("Đ", "D")
# --- Model setup (module-level side effect: downloads weights on first run) ---
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): despite the variable name, no quantization step is visible
# here — this only loads the checkpoint through ONNX Runtime. Loading a plain
# PyTorch checkpoint may also require export=True; confirm with optimum docs.
quantized_model = ORTModelForTokenClassification.from_pretrained(model_name)
# NER pipeline used as a fallback when no keyword matches.
# NOTE(review): this base checkpoint was presumably not fine-tuned for NER,
# so its token-classification head may produce meaningless labels — verify
# that a fine-tuned NER model wasn't intended instead.
ner_model = pipeline("ner", model=quantized_model, tokenizer=tokenizer, aggregation_strategy="simple")
# Classify input
def classify_and_extract(user_input):
    """Classify a free-text expense entry.

    Lowercases and diacritic-strips the input, extracts the first numeric
    amount, and first tries keyword matching against CATEGORIES; if no
    keyword matches, falls back to the NER pipeline on the raw input.

    Returns a dict with keys "Main Category", "Sub Category", "Amount"
    (string, or "Unknown") and "NER Entities" (empty list on keyword match).
    """
    normalized_input = normalize_vietnamese(user_input.lower())
    # Bug fix: r"\d+" truncated thousands-grouped amounts ("100.000" -> "100").
    # Also accept [.,]-separated 3-digit groups; plain digit runs match as before.
    amount_match = re.search(r"\d+(?:[.,]\d{3})*", normalized_input)
    amount = amount_match.group(0) if amount_match else "Unknown"
    # Rule-based matching takes priority over the (expensive) NER fallback.
    for main_cat, subcategories in CATEGORIES.items():
        for sub_cat, keywords in subcategories.items():
            if any(keyword in normalized_input for keyword in keywords):
                return {
                    "Main Category": main_cat,
                    "Sub Category": sub_cat,
                    "Amount": amount,
                    "NER Entities": [],
                }
    # Fallback: let the NER model annotate the original (un-normalized) text.
    ner_results = ner_model(user_input)
    return {
        "Main Category": "Uncategorized",
        "Sub Category": "Unknown",
        "Amount": amount,
        "NER Entities": ner_results,
    }
# Gradio app
def process_user_input(user_input):
    """Classify *user_input* and render the result as display text."""
    result = classify_and_extract(user_input)
    report_lines = [
        f"Main Category: {result['Main Category']}",
        f"Sub Category: {result['Sub Category']}",
        f"Amount: {result['Amount']}",
        f"Entities: {result['NER Entities']}",
    ]
    return "\n".join(report_lines)
# --- Gradio UI (module-level side effect: starts a web server on launch) ---
iface = gr.Interface(
    fn=process_user_input,
    inputs="text",
    outputs="text",
    title="Expenditure Classifier",
    description="Classify and categorize spending."
)
# share=True additionally opens a public Gradio tunnel URL beside the local server.
iface.launch(share=True)