Meomap committed on
Commit
0bfabb1
·
verified ·
1 Parent(s): f57325a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -15
app.py CHANGED
@@ -1,11 +1,9 @@
1
  import re
2
- from transformers import pipeline
 
3
  import gradio as gr
4
 
5
- # Load a pre-trained multilingual NER model for entity recognition
6
- ner_model = pipeline("ner", model="dbmdz/bert-base-multilingual-cased", aggregation_strategy="simple")
7
-
8
- # Define categories and their associated keywords
9
  CATEGORIES = {
10
  "Need": {
11
  "Utilities": ["dien", "nuoc", "gas", "internet", "dienthoai"],
@@ -19,7 +17,7 @@ CATEGORIES = {
19
  },
20
  "Want": {
21
  "Dining Out": ["nha hang", "quan an", "cafe", "tra sua"],
22
- "Entertainment": ["phim", "karaoke", "game", "nhac"],
23
  "Travel": ["du lich", "ve may bay", "khach san"],
24
  "Fitness": ["gym", "yoga", "the thao"],
25
  "Shopping": ["quan ao", "phu kien", "dien thoai", "luxury"],
@@ -39,9 +37,19 @@ CATEGORIES = {
39
 
40
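  # Normalize Vietnamese input (NOTE: the regex deletes the listed accented characters outright, including "đ", so the trailing "đ" -> "d" replace never matches)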
41
  def normalize_vietnamese(text):
42
- return re.sub(r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', '', text).replace("đ", "d")
 
 
 
 
 
 
 
 
 
 
43
 
44
- # Extract entities and classify
45
  def classify_and_extract(user_input):
46
  normalized_input = normalize_vietnamese(user_input.lower())
47
 
@@ -49,10 +57,7 @@ def classify_and_extract(user_input):
49
  amount_match = re.search(r"(\d+(\.\d{1,2})?)", normalized_input)
50
  amount = amount_match.group(0) if amount_match else "Unknown"
51
 
52
- # Run the NER model to detect entities
53
- ner_results = ner_model(user_input)
54
-
55
- # Match keywords for categories
56
  for main_category, subcategories in CATEGORIES.items():
57
  for subcategory, keywords in subcategories.items():
58
  if any(keyword in normalized_input for keyword in keywords):
@@ -60,10 +65,11 @@ def classify_and_extract(user_input):
60
  "Main Category": main_category,
61
  "Sub Category": subcategory,
62
  "Amount": amount,
63
- "Entities": ner_results,
64
  }
65
 
66
- # Default response if no match
 
67
  return {
68
  "Main Category": "Uncategorized",
69
  "Sub Category": "Unknown",
@@ -89,4 +95,4 @@ iface = gr.Interface(
89
  description="Classify expenditures into main and subcategories (Need, Want, Saving/Investment) and extract amounts."
90
  )
91
 
92
- iface.launch()
 
1
  import re
2
+ from transformers import pipeline, AutoTokenizer
3
+ from optimum.onnxruntime import ORTModelForTokenClassification
4
  import gradio as gr
5
 
6
+ # Define categories and their keywords
 
 
 
7
  CATEGORIES = {
8
  "Need": {
9
  "Utilities": ["dien", "nuoc", "gas", "internet", "dienthoai"],
 
17
  },
18
  "Want": {
19
  "Dining Out": ["nha hang", "quan an", "cafe", "tra sua"],
20
+ "Entertainment": ["phim", "karaoke", "game", "nhac", "do choi", "bup be"],
21
  "Travel": ["du lich", "ve may bay", "khach san"],
22
  "Fitness": ["gym", "yoga", "the thao"],
23
  "Shopping": ["quan ao", "phu kien", "dien thoai", "luxury"],
 
37
 
38
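  # Normalize Vietnamese input (NOTE: the regex deletes the listed accented characters outright, including "đ", so the trailing "đ" -> "d" replace never matches)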
39
  def normalize_vietnamese(text):
40
+ return re.sub(
41
+ r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', '', text
42
+ ).replace("đ", "d")
43
+
44
+ # Load the model and export it to ONNX for ONNX Runtime (no quantization step is applied here)
45
+ model_name = "distilbert-base-multilingual-cased"
46
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
47
+ quantized_model = ORTModelForTokenClassification.from_pretrained(model_name, from_transformers=True)
48
+
49
+ # Create the NER pipeline with the ONNX Runtime model
50
+ ner_model = pipeline("ner", model=quantized_model, tokenizer=tokenizer, aggregation_strategy="simple")
51
 
52
+ # Classify input
53
  def classify_and_extract(user_input):
54
  normalized_input = normalize_vietnamese(user_input.lower())
55
 
 
57
  amount_match = re.search(r"(\d+(\.\d{1,2})?)", normalized_input)
58
  amount = amount_match.group(0) if amount_match else "Unknown"
59
 
60
+ # Rule-based matching for categories
 
 
 
61
  for main_category, subcategories in CATEGORIES.items():
62
  for subcategory, keywords in subcategories.items():
63
  if any(keyword in normalized_input for keyword in keywords):
 
65
  "Main Category": main_category,
66
  "Sub Category": subcategory,
67
  "Amount": amount,
68
+ "Entities": [] # Skip NER if matched via rules
69
  }
70
 
71
+ # Fall back to the NER model for unmatched cases
72
+ ner_results = ner_model(user_input)
73
  return {
74
  "Main Category": "Uncategorized",
75
  "Sub Category": "Unknown",
 
95
  description="Classify expenditures into main and subcategories (Need, Want, Saving/Investment) and extract amounts."
96
  )
97
 
98
+ iface.launch()