File size: 2,593 Bytes
0bfabb1
 
9839b62
0f90513
cb46b54
9839b62
0f90513
19f103a
9839b62
 
19f103a
 
9839b62
 
19f103a
 
9839b62
 
0f90513
cb46b54
9839b62
0f90513
0bfabb1
 
 
 
9839b62
0bfabb1
 
9839b62
0bfabb1
9839b62
0bfabb1
cb46b54
0bfabb1
f57325a
0f90513
9839b62
 
cb46b54
9839b62
 
 
19f103a
 
9839b62
 
f57325a
9839b62
19f103a
cb46b54
9839b62
0bfabb1
19f103a
 
 
f57325a
9839b62
19f103a
cb46b54
9839b62
cb46b54
f57325a
19f103a
 
 
f57325a
9839b62
19f103a
cb46b54
 
 
 
 
0f90513
9839b62
cb46b54
 
9839b62
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import re
import unicodedata

import gradio as gr
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import pipeline, AutoTokenizer

# Spending taxonomy: main category -> sub category -> list of keywords.
# Keywords are written lowercase and without diacritics because
# classify_and_extract compares them against input that has been lower-cased
# and passed through normalize_vietnamese.
CATEGORIES = {
    "Need": {
        "Housing": ["nha", "thue", "sua nha"],
        "Groceries": ["thuc pham", "rau cu", "sieu thi"],
    },
    "Want": {
        "Entertainment": ["phim", "karaoke", "game", "do choi"],
        "Dining Out": ["cafe", "nha hang", "tra sua"],
    },
    "Saving/Investment": {
        "Savings": ["quy tiet kiem", "dau tu", "tai san"],
    },
}

# Normalize Vietnamese text
def normalize_vietnamese(text):
    """Return *text* with Vietnamese diacritics folded to plain base letters.

    Accented letters are mapped to their unaccented form (for example
    "nhà hàng" -> "nha hang") rather than being dropped, so the result can be
    compared against unaccented keyword lists. Letter case is preserved.

    Args:
        text: The string to normalize.

    Returns:
        The string with combining marks removed and "đ"/"Đ" replaced by
        "d"/"D". Characters without diacritics are returned unchanged.
    """
    # NFD splits each accented letter into its base letter followed by one or
    # more combining marks (category "Mn"), which are then filtered out.
    decomposed = unicodedata.normalize("NFD", text)
    stripped = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    # Recompose whatever is left so non-Vietnamese text is not left in
    # decomposed form.
    recomposed = unicodedata.normalize("NFC", stripped)
    # "đ"/"Đ" are standalone letters with no Unicode decomposition, so they
    # need an explicit mapping.
    return recomposed.replace("đ", "d").replace("Đ", "D")

# Load the tokenizer and the ONNX Runtime token-classification model.
# NOTE(review): despite the variable name, no quantization step is visible
# here; the checkpoint is loaded as published. Confirm whether a quantized
# export was intended.
# NOTE(review): "distilbert-base-multilingual-cased" looks like a base
# checkpoint rather than one fine-tuned for NER, so the entity labels produced
# below may not be meaningful. Verify against the model card.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
quantized_model = ORTModelForTokenClassification.from_pretrained(model_name)

# Create the NER pipeline used as the fallback in classify_and_extract.
# aggregation_strategy="simple" asks the pipeline to group word pieces into
# whole entities instead of returning one result per token.
ner_model = pipeline("ner", model=quantized_model, tokenizer=tokenizer, aggregation_strategy="simple")

# Classify input
def classify_and_extract(user_input):
    """Classify a spending description and extract the amount it mentions.

    The text is lower-cased, passed through ``normalize_vietnamese`` and
    compared against the keyword lists in ``CATEGORIES``. When no keyword is
    found, the raw input is handed to ``ner_model`` as a fallback.

    Args:
        user_input: Free-text description of an expense. ``None`` is treated
            as an empty string.

    Returns:
        A dict with the keys:
            "Main Category": matched main category, or "Uncategorized".
            "Sub Category": matched sub category, or "Unknown".
            "Amount": the first number found in the text as a string
                (separator groups such as "50.000" are kept), or "Unknown".
            "NER Entities": ``[]`` when a keyword matched or the input was
                blank, otherwise whatever ``ner_model`` returned.
    """
    text = user_input or ""
    normalized_input = normalize_vietnamese(text.lower())

    # Keep thousands/decimal separator groups so "50.000" is not cut to "50".
    amount_match = re.search(r"\d+(?:[.,]\d+)*", normalized_input)
    amount = amount_match.group(0) if amount_match else "Unknown"

    # Rule-based matching. The longest matching keyword wins, so a short
    # keyword ("nha") cannot shadow a longer one that contains it
    # ("nha hang"). Ties keep the first hit in CATEGORIES order.
    best_match = None
    best_length = 0
    for main_cat, subcategories in CATEGORIES.items():
        for sub_cat, keywords in subcategories.items():
            for keyword in keywords:
                if len(keyword) > best_length and keyword in normalized_input:
                    best_match = (main_cat, sub_cat)
                    best_length = len(keyword)

    if best_match is not None:
        main_cat, sub_cat = best_match
        return {
            "Main Category": main_cat,
            "Sub Category": sub_cat,
            "Amount": amount,
            "NER Entities": [],
        }

    # Fallback to the NER model; blank input has nothing to tag, so the model
    # call is skipped.
    ner_results = ner_model(text) if text.strip() else []
    return {
        "Main Category": "Uncategorized",
        "Sub Category": "Unknown",
        "Amount": amount,
        "NER Entities": ner_results,
    }

# Gradio app
def process_user_input(user_input):
    """Run the classifier on *user_input* and render the result as text.

    Args:
        user_input: The expense description typed into the UI.

    Returns:
        A four-line string listing the main category, sub category, amount
        and NER entities returned by ``classify_and_extract``.
    """
    outcome = classify_and_extract(user_input)
    report_lines = [
        f"Main Category: {outcome['Main Category']}",
        f"Sub Category: {outcome['Sub Category']}",
        f"Amount: {outcome['Amount']}",
        f"Entities: {outcome['NER Entities']}",
    ]
    return "\n".join(report_lines)

# Single text-in / text-out Gradio interface wired to process_user_input.
iface = gr.Interface(
    fn=process_user_input,
    inputs="text",
    outputs="text",
    title="Expenditure Classifier",
    description="Classify and categorize spending."
)

# Start the web UI. share=True asks Gradio to create a public share link.
# NOTE(review): this runs at module top level, so importing this file also
# launches the server. Confirm that is intended.
iface.launch(share=True)