anisgtboi committed on
Commit
5b2f305
·
verified ·
1 Parent(s): 612a620

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +176 -114
app.py CHANGED
@@ -2,6 +2,7 @@
2
  """
3
  Enhanced Dialect Bengali Translator with Semantic Search
4
  Uses both text similarity and semantic pattern matching
 
5
  """
6
 
7
  import difflib
@@ -10,15 +11,16 @@ import gradio as gr
10
  from collections import defaultdict
11
  import re
12
 
13
- # === Phrase data: [Dialect Bengali, Dialect Bengali Script, Actual Bengali, Benglish] ===
14
  phrases_data = [
15
- ["gesle ni", "গেসলে নি", "গিয়েছিলে কি?", "giese chile ki?"],
16
- ["oislo ni", "ওইস্লো নি", "হয়েছে কি?", "hoyeche ki?"],
17
- ["oigese ni", "ওইগেসে নি", "হয়েগেছে কি?", "hoyegese ki?"],
18
- ["oise", "ওইসে", "হয়েছে", "hoyeche"],
19
- ["bala oise", "বালা ওইসে", "ভালো হয়েছে", "bhalo hoyeche"],
20
- ["kub bala oise", "কুব বালা ওইসে", "অনেক ভালো হয়েছে", "onek bhalo hoyeche"],
21
- ["oise jen", "ওইসে জেন", "হয়েছিল যে", "hoyechilo je"],
 
22
  ["jaite ni", "জাইতে নি", "যাবে কি?", "jabe ki?"],
23
  ["or ni", "ওর নি", "হচ্ছে কি?", "hocche ki?"],
24
  ["or", "ওর", "হচ্ছে", "hocche"],
@@ -26,13 +28,7 @@ phrases_data = [
26
  ["bala ni", "বালা নি", "ভালো কি?", "bhalo ki?"],
27
  ["or je", "ওর যে", "হচ্ছে যে", "hocche je"],
28
  ["jaibe ni", "জাইবে নি", "যাবে কি?", "jabe ki?"],
29
- ["jare ni", "जारे नि", "যাচ্ছো কি?", "jaccho ki?"],
30
- ["ami jaimu", "আমি জাইমু", "আমি যাব", "ami jabo"],
31
- ["jaimu", "জাইমু", "যাব", "jabo"],
32
- ["jaibo", "জাইবো", "যাবে", "jabe"],
33
- ["oibo", "ওইবো", "হবে", "hobe"],
34
- ["oibo jen", "ওইবো জেন", "হবে যে", "hobe je"],
35
- ["Goto kali", "গোতো কালি", "গত কাল", "goto kal"],
36
  ["Kita kobor?", "কিতা খবর?", "কি খবর?", "ki khobor?"],
37
  ["Kita korde?", "কিতা কোর্দে?", "কি করছে?", "ki korchho?"],
38
  ["acha oibo-tik ase", "আচা ওইবো-তিক আসে", "ঠিক আছে", "thik ache"],
@@ -42,43 +38,112 @@ phrases_data = [
42
  ["se hole", "সে হলে", "তাহলে", "tahole"],
43
  ["Sob bala asoin ni", "সব বালা আসইন নি", "সব ভালো আছে কি?", "sob bhalo ache ki?"],
44
  ["Sob bala ase", "সব বালা আসে", "সব ভালো আছে", "sob bhalo ache"],
45
- ["Sob bala", "সব বালা", "সব ভালো", "sob bhalo"],
46
  ["asoini", "আসইনি", "আছে কি?", "ache ki?"],
47
- ["ase ni", "আছে নি", "আছে কি?", "ache ki?"],
48
  ["ase", "আসে", "আছে", "ache"],
49
- ["Sob", "সব", "সব", "sob"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  ]
51
 
52
- # Semantic mapping of dialect patterns to meanings
53
  semantic_patterns = {
54
- "ni": {"meaning": "কি", "type": "question"},
55
- "or": {"meaning": "হচ্ছে", "type": "verb"},
56
- "oise": {"meaning": "হয়েছে", "type": "verb"},
57
- "oibo": {"meaning": "হবে", "type": "verb"},
58
- "jaimu": {"meaning": "যাব", "type": "verb"},
59
- "jaibo": {"meaning": "যাবে", "type": "verb"},
60
- "kobor": {"meaning": "খবর", "type": "noun"},
61
- "korde": {"meaning": "করছে", "type": "verb"},
62
- "acha": {"meaning": "ঠিক", "type": "adjective"},
63
- "bala": {"meaning": "ভালো", "type": "adjective"},
64
- "kub": {"meaning": "অনেক", "type": "adverb"},
65
- "gesle": {"meaning": "গিয়েছিলে", "type": "verb"},
66
- "oislo": {"meaning": "হয়েছে", "type": "verb"},
67
- "oigese": {"meaning": "হয়েগেছে", "type": "verb"},
68
- "jen": {"meaning": "যে", "type": "conjunction"},
69
- "je": {"meaning": "যে", "type": "conjunction"},
70
- "tik": {"meaning": "ঠিক", "type": "adjective"},
71
- "ase": {"meaning": "আছে", "type": "verb"},
72
- "asoin": {"meaning": "আছে", "type": "verb"},
73
- "asoini": {"meaning": "আছে কি", "type": "verb+question"},
74
- "Goto": {"meaning": "গত", "type": "adjective"},
75
- "kali": {"meaning": "কাল", "type": "noun"},
76
- "Kita": {"meaning": "কি", "type": "question"},
77
- "tew": {"meaning": "তাহলে", "type": "conjunction"},
78
- "tente": {"meaning": "তাহলে", "type": "conjunction"},
79
- "to": {"meaning": "তাহলে", "type": "conjunction"},
80
- "se hole": {"meaning": "তাহলে", "type": "conjunction"},
81
- "Sob": {"meaning": "সব", "type": "adjective"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  }
83
 
84
  # Precompute data structures for matching
@@ -86,7 +151,7 @@ dialects = [p[0] for p in phrases_data]
86
  dialects_lower = [d.lower() for d in dialects]
87
  actual_bengali_list = [p[2] for p in phrases_data]
88
 
89
- # Create a mapping from dialect to all other forms
90
  dialect_to_all = {p[0].lower(): p for p in phrases_data}
91
 
92
  def semantic_analysis(user_input):
@@ -94,39 +159,45 @@ def semantic_analysis(user_input):
94
  user_lower = user_input.lower()
95
  detected_patterns = []
96
  meaning_components = []
97
-
98
- # Check for semantic patterns
99
  for pattern, info in semantic_patterns.items():
100
- if pattern in user_lower:
101
- detected_patterns.append((pattern, info["meaning"], info["type"]))
102
- meaning_components.append(info["meaning"])
103
-
 
 
 
 
104
  return detected_patterns, meaning_components
105
 
106
- def find_semantic_matches(user_input, threshold=0.4):
107
- """Find matches based on semantic similarity"""
108
  user_lower = user_input.lower()
109
  matches = []
110
-
111
  # Get semantic patterns from user input
112
  detected_patterns, meaning_components = semantic_analysis(user_input)
113
-
114
  # If we found semantic patterns, look for phrases with similar meanings
115
  if meaning_components:
116
  for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
117
- # Check if the actual Bengali contains any of the meaning components
118
- match_score = 0
119
  for meaning in meaning_components:
120
  if meaning in actual:
121
- match_score += 0.3
122
-
123
- # Also consider text similarity
 
 
124
  text_similarity = difflib.SequenceMatcher(None, user_lower, dialect.lower()).ratio()
125
- total_score = match_score + (text_similarity * 0.7)
126
-
127
  if total_score > threshold:
128
  matches.append((i, total_score, "semantic"))
129
-
130
  return matches
131
 
132
  def format_suggestions_from_indices(indices, match_type="text", scores=None):
@@ -134,16 +205,16 @@ def format_suggestions_from_indices(indices, match_type="text", scores=None):
134
  lines = []
135
  for i, idx in enumerate(indices):
136
  d, dialect_bengali, actual, benglish = phrases_data[idx]
137
-
138
  score_str = ""
139
  if scores is not None and i < len(scores):
140
  s_pct = int(scores[i] * 100)
141
  score_str = f" ({match_type}-match: {s_pct}%)"
142
-
143
  lines.append(f"• {d}{score_str}\n Dialect Bengali: {dialect_bengali}\n Actual Bengali: {actual}\n Benglish: {benglish}")
144
  return "\n\n".join(lines)
145
 
146
- def translate_text(user_text, top_k: int = 5):
147
  """
148
  Returns: (dialect_out, actual_out, benglish_out, suggestions_out)
149
  """
@@ -159,71 +230,62 @@ def translate_text(user_text, top_k: int = 5):
159
  if q_lower == dialect.lower():
160
  return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"
161
 
162
- # 2) Check if input contains multiple phrases
163
- potential_phrases = re.split(r'[.,;!?]\s*', q)
164
  if len(potential_phrases) > 1:
165
  results = []
166
  for phrase in potential_phrases:
167
- if phrase.strip():
168
- # Try to match each phrase individually
169
- for d, dialect_bengali, actual, benglish in phrases_data:
170
- if phrase.lower().strip() == d.lower():
171
- results.append(f"{dialect_bengali} {actual} → {benglish}")
172
- break
173
- else:
174
- results.append(f"'{phrase}' → No match found")
175
-
176
- if results:
177
- return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)
178
 
179
  # 3) Semantic matches
180
  semantic_matches = find_semantic_matches(q)
181
  if semantic_matches:
 
182
  semantic_matches.sort(key=lambda x: x[1], reverse=True)
183
- best_idx = semantic_matches[0][0]
184
- d, dialect_bengali, actual, benglish = phrases_data[best_idx]
185
-
186
- # Format suggestions
187
- indices = [idx for idx, score, match_type in semantic_matches[:top_k]]
188
- scores = [score for idx, score, match_type in semantic_matches[:top_k]]
189
  suggestions = "🔍 Semantic matches found:\n\n" + format_suggestions_from_indices(indices, "semantic", scores)
 
 
 
190
  return dialect_bengali, actual, benglish, suggestions
191
 
192
- # 4) Partial matches in dialect
193
  partial_matches = []
194
  for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
195
  if q_lower in dialect.lower() or dialect.lower() in q_lower:
196
  similarity = difflib.SequenceMatcher(None, q_lower, dialect.lower()).ratio()
197
  partial_matches.append((i, similarity))
198
-
199
  if partial_matches:
200
  partial_matches.sort(key=lambda x: x[1], reverse=True)
201
- best_idx = partial_matches[0][0]
202
- d, dialect_bengali, actual, benglish = phrases_data[best_idx]
203
-
204
- # Format suggestions
205
  indices = [idx for idx, score in partial_matches[:top_k]]
206
  scores = [score for idx, score in partial_matches[:top_k]]
 
 
207
  suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
208
  return dialect_bengali, actual, benglish, suggestions
209
 
210
- # 5) difflib close textual matches in dialect
211
  close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
212
  if close_matches:
213
- best_text = close_matches[0]
214
- idx = dialects_lower.index(best_text)
215
- d, dialect_bengali, actual, benglish = phrases_data[idx]
216
-
217
- text_sim_scores = []
218
- for m in close_matches:
219
- score = difflib.SequenceMatcher(None, q_lower, m).ratio()
220
- text_sim_scores.append(score)
221
  indices = [dialects_lower.index(m) for m in close_matches]
 
 
 
222
  suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
223
  return dialect_bengali, actual, benglish, suggestions
224
 
225
- # 6) Nothing found
226
- sample_phrases = [p[0] for p in phrases_data[:8]]
227
  return "", "", "", "❓ NO MATCH FOUND\n\nTry these sample phrases:\n" + "\n".join([f"• {ph}" for ph in sample_phrases])
228
 
229
  except Exception as ex:
@@ -236,7 +298,7 @@ def show_semantic_analysis(user_text):
236
  return ""
237
  patterns, meanings = semantic_analysis(user_text)
238
  if patterns:
239
- return f"Detected patterns: {', '.join([f'{p}→{m}' for p, m, t in patterns])}"
240
  return "No specific patterns detected"
241
 
242
  # Custom CSS for a softer, less blinding color scheme
@@ -261,10 +323,10 @@ body {
261
  with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Soft()) as demo:
262
  gr.Markdown("# 🌍 Dialect Bengali → Actual Bengali → Benglish")
263
  gr.Markdown("Type a phrase in your dialect. The app uses both text and semantic matching to find similar phrases.")
264
-
265
  # Define input component first
266
  inp = gr.Textbox(label="Type phrase in Dialect Bengali", placeholder="e.g. Kita kobor? Sob bala asoin ni")
267
-
268
  with gr.Row():
269
  with gr.Column(scale=1):
270
  gr.Markdown("### Examples to try:")
@@ -275,24 +337,24 @@ with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Sof
275
  )
276
  with gr.Column(scale=2):
277
  btn = gr.Button("Translate / Find", variant="primary")
278
-
279
  with gr.Row():
280
  out_dialect = gr.Textbox(label="Dialect Bengali (Bengali Script)")
281
  out_actual = gr.Textbox(label="Actual Bengali (Standard)")
282
  out_benglish = gr.Textbox(label="Benglish (Phonetic English)")
283
-
284
  with gr.Row():
285
  semantic_info = gr.Textbox(label="Semantic Analysis", lines=2)
286
-
287
  suggestions = gr.Textbox(label="Status / Suggestions / Top Candidates", lines=8)
288
 
289
  # Set up event handlers
290
  btn.click(
291
- fn=translate_text,
292
- inputs=[inp],
293
  outputs=[out_dialect, out_actual, out_benglish, suggestions]
294
  )
295
-
296
  inp.change(
297
  fn=show_semantic_analysis,
298
  inputs=[inp],
 
2
  """
3
  Enhanced Dialect Bengali Translator with Semantic Search
4
  Uses both text similarity and semantic pattern matching
5
+ Updated to include new dialect patterns and polite/negative 'des/dis' behavior
6
  """
7
 
8
  import difflib
 
11
  from collections import defaultdict
12
  import re
13
 
14
+ # === Phrase data: [Dialect Latin, Dialect Bengali Script, Actual Bengali (Std), Benglish] ===
15
  phrases_data = [
16
+ # Questions / common
17
+ ["gesle ni", "গেসলে নি", "গিয়েছিলে কি?", "giese chile ki?"],
18
+ ["oislo ni", "ওইস্লো নি", "হয়েছে কি?", "hoyeche ki?"],
19
+ ["oigese ni", "ওইগেসে নি", "হয়ে গেছে কি?", "hoyegese ki?"],
20
+ ["oise", "ওইসে", "হয়েছে", "hoyeche"],
21
+ ["bala oise", "বালা ওইসে", "ভালো হয়েছে", "bhalo hoyeche"],
22
+ ["kub bala oise", "কুব বালা ওইসে", "অনেক ভালো হয়েছে", "onek bhalo hoyeche"],
23
+ ["oise jen", "ওইসে জেন", "হয়েছিল যে", "hoyechilo je"],
24
  ["jaite ni", "জাইতে নি", "যাবে কি?", "jabe ki?"],
25
  ["or ni", "ওর নি", "হচ্ছে কি?", "hocche ki?"],
26
  ["or", "ওর", "হচ্ছে", "hocche"],
 
28
  ["bala ni", "বালা নি", "ভালো কি?", "bhalo ki?"],
29
  ["or je", "ওর যে", "হচ্ছে যে", "hocche je"],
30
  ["jaibe ni", "জাইবে নি", "যাবে কি?", "jabe ki?"],
31
+ ["jare ni", "জারে নি", "যাচ্ছো কি?", "jaccho ki?"],
 
 
 
 
 
 
32
  ["Kita kobor?", "কিতা খবর?", "কি খবর?", "ki khobor?"],
33
  ["Kita korde?", "কিতা কোর্দে?", "কি করছে?", "ki korchho?"],
34
  ["acha oibo-tik ase", "আচা ওইবো-তিক আসে", "ঠিক আছে", "thik ache"],
 
38
  ["se hole", "সে হলে", "তাহলে", "tahole"],
39
  ["Sob bala asoin ni", "সব বালা আসইন নি", "সব ভালো আছে কি?", "sob bhalo ache ki?"],
40
  ["Sob bala ase", "সব বালা আসে", "সব ভালো আছে", "sob bhalo ache"],
 
41
  ["asoini", "আসইনি", "আছে কি?", "ache ki?"],
 
42
  ["ase", "আসে", "আছে", "ache"],
43
+
44
+ # Future / Present / Past core verbs (ja / de / fawa / ka)
45
+ ["jaimu", "জাইমু", "যাব", "jabo"],
46
+ ["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
47
+ ["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
48
+ ["jaibo", "জাইবো", "যাবে", "jabe"],
49
+ ["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
50
+ ["oibo", "ওইবো", "হবে", "hobe"],
51
+ ["oibo jen", "ওইবো জেন", "হবে যে", "hobe je"],
52
+
53
+ ["ami jaimu", "আমি জাইমু", "আমি যাব", "ami jabo"],
54
+ ["Ami bazaro jaimu", "আমি বাজারো জাইমু", "আমি বাজারে যাব", "ami bazar e jabo"],
55
+ ["He rit aise", "হে রিত আসে", "সে রাতে এসেছে", "se rate esheche"],
56
+
57
+ # Give (de) family
58
+ ["des", "দেস", "দাও (মৃদু)", "des (give, friendly)"],
59
+ ["des na", "দেস না", "দাও (দয়া করে, মৃদু অনুরোধ)", "des na (please give)"],
60
+ ["dis", "দিস", "না দাও / নিষেধ", "dis (don't give)"],
61
+ ["dis na", "দিস না", "দেও না", "dis na (don't give)"],
62
+ ["dilaisi", "দিলাইসি", "দিয়েছি", "diyechi (I gave)"],
63
+ ["dilaise", "দিলাইসে", "দিয়েছে", "diyeche (he gave)"],
64
+ ["dilaisoin", "দিলাইসইন", "দিয়েছেন (সম্মানভাষা)", "diyechen (honorific)"],
65
+ ["dise na", "দিসে না", "দেয়নি", "deni (didn't give)"],
66
+ ["dibo", "দিবো", "দেব", "debo (will give)"],
67
+ ["der amare", "দের আমিরে", "সে আমাকে দেয়", "se amake dey"],
68
+ ["dibo amare", "দিবো আমিরে", "সে আমাকে দেবে", "se amake debe"],
69
+
70
+ # Get / receive (fawa) family
71
+ ["faisi", "ফাইসি", "পেয়েছি", "peyechi (I got)"],
72
+ ["faisi na", "ফাইসি না", "পাইনি", "pelam na (didn't get)"],
73
+ ["faisot ni", "ফাইসোট নি", "পেলে কি?", "pele ki?"],
74
+ ["faislo", "ফাইসলো", "পেয়ে গেল/লাভ করল (3sg past)", "pelo (he got)"],
75
+ ["faislam", "ফাইসলাম", "পেয়েছিলাম", "pelam (I got past)"],
76
+ ["faisla", "ফাইসলা", "পেয়েছিল (they)", "pela (they got)"],
77
+ ["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
78
+ ["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
79
+ ["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
80
+ ["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
81
+ ["faibo", "ফাইবো", "সে পাবে", "se pabe"],
82
+ ["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],
83
+
84
+ # Eat (ka) family
85
+ ["kaimu", "কাইমু", "খাব", "khaimu (I will eat)"],
86
+ ["kaibay", "কাইবে", "তুমি খাব (dialect)", "tumi khabe"],
87
+ ["kaibe", "কাইবে", "তুমি খাব (friend)", "tumi khabe (friend)"],
88
+ ["kaibo", "কাইবো", "সে খাবে", "se khabe"],
89
+ ["kaiba", "কাইবা", "তারা খাবে", "tara khabe"],
90
+
91
+ # Other sample sentences from user's corpus
92
+ ["Ami faisi ekta notun jinish", "আমি ফাইসি একটা নতুন জিনিস", "আমি একটা নতুন জিনিস পেয়েছি", "ami ekta notun jinish peyechi"],
93
+ ["Tumi taka faiso ni", "তুমি টাকা ফাইসো নি", "তুমি টাকা পেয়েছ কি?", "tumi taka peyecho ki?"],
94
+ ["He sobsomoy amare teka dey", "হে সবসময় আমিারে তেকা দেয়", "সে সবসময় আমাকে টাকা দেয়", "se shobshomoy amake taka dey"],
95
+ ["Ami bazaro jaimu", "আমি বাজারো জাইমু", "আমি বাজারে যাব", "ami bazar e jabo"],
96
+ ["Tara bazaro bohut jinish faisoin", "তারা বাজারো বহুত জিনিস ফাইসইন", "তারা বাজারে অনেক জিনিস পেয়েছে", "tara bazar e onek jinish peyechhe"],
97
+ ["Tumi boi diso ni", "তুমি বই দিসো নি", "আপনি কি বই দিয়েছেন?", "apni boi diyechen?"],
98
+ ["Tuin boi disot ni", "তুইন বই দিসট নি", "তুই বই দিয়েছ কি?", "tui boi diyechish?"],
99
+ ["Bifodo asi", "বিফোডো আছি", "বিপদে আছি", "bipode achi"],
100
+ ["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"]
101
  ]
102
 
103
# Semantic mapping of dialect patterns to meanings + types.
#
# Each key is a regular expression that semantic_analysis() applies with
# re.search() to the *lower-cased* user input, so every literal inside a key
# must itself be lower case. Each value carries the Standard Bengali meaning
# and a coarse grammatical type. Meanings are also used as substrings when
# scoring against the "Actual Bengali" column of phrases_data, so their
# spelling has to agree with that column.
semantic_patterns = {
    # question/particles
    # (one entry only: a second, end-anchored copy would report "ni" twice)
    r"\bni\b": {"meaning": "কি", "type": "question"},
    # verbs / roots
    r"\bor\b": {"meaning": "হচ্ছে", "type": "verb"},
    r"\boise\b": {"meaning": "হয়েছে", "type": "verb"},
    r"\boibo\b": {"meaning": "হবে", "type": "verb"},
    r"\bjaimu\b": {"meaning": "যাব", "type": "verb"},
    # covers jaibay / jaiba / jaibe / jaibo, all listed in phrases_data
    r"\bjaib(?:ay|[aeo])\b": {"meaning": "যাবে", "type": "verb"},
    r"\bkobor\b": {"meaning": "খবর", "type": "noun"},
    r"\bkorde\b": {"meaning": "করছে", "type": "verb"},
    r"\bacha\b": {"meaning": "ঠিক", "type": "adjective"},
    r"\bbala\b": {"meaning": "ভালো", "type": "adjective"},
    r"\bkub\b": {"meaning": "অনেক", "type": "adverb"},
    r"\bgesle\b": {"meaning": "গিয়েছিলে", "type": "verb"},
    r"\boislo\b": {"meaning": "হয়েছে", "type": "verb"},
    # spelled with a space to match the phrase table ("হয়ে গেছে কি?")
    r"\boigese\b": {"meaning": "হয়ে গেছে", "type": "verb"},
    r"\bjen\b": {"meaning": "যে", "type": "conjunction"},
    r"\bje\b": {"meaning": "যে", "type": "conjunction"},
    r"\btik\b": {"meaning": "ঠিক", "type": "adjective"},
    r"\base\b": {"meaning": "আছে", "type": "verb"},
    r"\basoin\b": {"meaning": "আছে", "type": "verb"},
    r"\basoini\b": {"meaning": "আছে কি", "type": "verb+question"},
    # lower case on purpose: the input is lower-cased before matching
    r"\bgoto\b": {"meaning": "গত", "type": "adjective"},
    r"\bkali\b": {"meaning": "কাল", "type": "noun"},
    r"\bkita\b": {"meaning": "কি", "type": "question"},
    r"\btew\b": {"meaning": "তাহলে", "type": "conjunction"},
    # give/get polarity (important dialect contrast)
    r"\bdes\b": {"meaning": "দান/দাও (বন্ধু-মৃদু)", "type": "give_positive"},
    r"\bdes\s+na\b": {"meaning": "মৃদু অনুরোধ: দাও", "type": "give_positive"},
    r"\bdis\b": {"meaning": "না দাও / নিষেধ", "type": "give_negative"},
    r"\bdis\s+na\b": {"meaning": "না দাও (নিষেধ)", "type": "give_negative"},
    # fawa/get variants
    r"\bfaisi\b": {"meaning": "পেয়েছি", "type": "verb"},
    r"\bfaisl[ao]m\b": {"meaning": "পেয়েছিলাম/পেয়েছি(past)", "type": "verb"},
    r"\bfaimu\b": {"meaning": "পাব", "type": "verb"},
    # covers faibay / faiba / faibe / faibo, all listed in phrases_data
    r"\bfaib(?:ay|[aeo])\b": {"meaning": "পাবে", "type": "verb"},
    # future pattern markers: these are verb *endings* (jai-mu, jai-bay, ...),
    # so they are anchored at the end of a word only; a leading \b would
    # require the ending to stand alone as its own word.
    r"\w+mu\b": {"meaning": "ভবিষ্যৎ: 1sg", "type": "tense_future"},
    r"\w+bay\b": {"meaning": "ভবিষ্যৎ: 2sg (tumi)", "type": "tense_future"},
    r"\w+bo\b": {"meaning": "ভবিষ্যৎ: 3sg", "type": "tense_future"},
    r"\w+ba\b": {"meaning": "ভবিষ্যৎ: plural/3pl", "type": "tense_future"},
}
148
 
149
  # Precompute data structures for matching
 
151
  dialects_lower = [d.lower() for d in dialects]
152
  actual_bengali_list = [p[2] for p in phrases_data]
153
 
154
+ # Create a mapping from dialect to full row
155
  dialect_to_all = {p[0].lower(): p for p in phrases_data}
156
 
157
  def semantic_analysis(user_input):
 
159
  user_lower = user_input.lower()
160
  detected_patterns = []
161
  meaning_components = []
162
+
163
+ # Use regex-based whole-word matching for patterns
164
  for pattern, info in semantic_patterns.items():
165
+ try:
166
+ if re.search(pattern, user_lower):
167
+ detected_patterns.append((pattern, info["meaning"], info["type"]))
168
+ meaning_components.append(info["meaning"])
169
+ except re.error:
170
+ # If pattern is bad, skip it safely
171
+ continue
172
+
173
  return detected_patterns, meaning_components
174
 
175
def find_semantic_matches(user_input, threshold=0.35, meaning_weight=0.35,
                          pattern_weight=0.25, text_weight=0.5):
    """Score every row of phrases_data against user_input.

    The score of a row is the sum of three parts:

    * ``meaning_weight`` for each distinct detected meaning that occurs as a
      substring of the row's Actual Bengali text;
    * ``pattern_weight`` for each distinct detected pattern that also matches
      the row's (lower-cased) dialect text;
    * ``text_weight`` times the difflib similarity ratio between the
      lower-cased input and the lower-cased dialect text.

    Args:
        user_input: Phrase typed by the user (dialect, Latin script).
        threshold: A row is kept only if its score is strictly greater.
        meaning_weight: Boost per meaning found in the Actual Bengali text.
        pattern_weight: Boost per pattern found in the dialect text.
        text_weight: Multiplier applied to the text similarity ratio.

    Returns:
        A list of ``(row_index, score, "semantic")`` tuples in phrases_data
        order (unsorted). Empty when semantic_analysis() detects nothing.
        The score is a sum of boosts and is not capped at 1.0.
    """
    user_lower = user_input.lower()
    detected_patterns, meaning_components = semantic_analysis(user_input)

    # Without any detected pattern there is nothing semantic to match on.
    if not meaning_components:
        return []

    # Several patterns can map to the same meaning; count each meaning and
    # each pattern once. dict.fromkeys keeps first-seen order, so scoring is
    # deterministic.
    meanings = list(dict.fromkeys(meaning_components))
    patterns = list(dict.fromkeys(pattern for pattern, _meaning, _type in detected_patterns))

    matches = []
    for i, (dialect, _dialect_bengali, actual, _benglish) in enumerate(phrases_data):
        dialect_lower = dialect.lower()

        meaning_hits = sum(1 for meaning in meanings if meaning in actual)
        # The dialect column is Latin script, so a Bengali meaning can never
        # be a substring of it; re-apply the detected regex instead. These
        # patterns already ran successfully in semantic_analysis().
        pattern_hits = sum(1 for pattern in patterns if re.search(pattern, dialect_lower))

        # text similarity between user and dialect form
        text_similarity = difflib.SequenceMatcher(None, user_lower, dialect_lower).ratio()

        total_score = (
            meaning_hits * meaning_weight
            + pattern_hits * pattern_weight
            + text_similarity * text_weight
        )
        if total_score > threshold:
            matches.append((i, total_score, "semantic"))

    return matches
202
 
203
  def format_suggestions_from_indices(indices, match_type="text", scores=None):
 
205
  lines = []
206
  for i, idx in enumerate(indices):
207
  d, dialect_bengali, actual, benglish = phrases_data[idx]
208
+
209
  score_str = ""
210
  if scores is not None and i < len(scores):
211
  s_pct = int(scores[i] * 100)
212
  score_str = f" ({match_type}-match: {s_pct}%)"
213
+
214
  lines.append(f"• {d}{score_str}\n Dialect Bengali: {dialect_bengali}\n Actual Bengali: {actual}\n Benglish: {benglish}")
215
  return "\n\n".join(lines)
216
 
217
+ def translate_text(user_text, top_k: int = 6):
218
  """
219
  Returns: (dialect_out, actual_out, benglish_out, suggestions_out)
220
  """
 
230
  if q_lower == dialect.lower():
231
  return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"
232
 
233
+ # 2) If input contains multiple phrases separated by punctuation
234
+ potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
235
  if len(potential_phrases) > 1:
236
  results = []
237
  for phrase in potential_phrases:
238
+ matched = False
239
+ for d, dialect_bengali, actual, benglish in phrases_data:
240
+ if phrase.lower() == d.lower():
241
+ results.append(f"{dialect_bengali} {actual} → {benglish}")
242
+ matched = True
243
+ break
244
+ if not matched:
245
+ results.append(f"'{phrase}' → No match found")
246
+ return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)
 
 
247
 
248
  # 3) Semantic matches
249
  semantic_matches = find_semantic_matches(q)
250
  if semantic_matches:
251
+ # sort and return top semantic candidates
252
  semantic_matches.sort(key=lambda x: x[1], reverse=True)
253
+ indices = [idx for idx, score, mt in semantic_matches[:top_k]]
254
+ scores = [score for idx, score, mt in semantic_matches[:top_k]]
 
 
 
 
255
  suggestions = "🔍 Semantic matches found:\n\n" + format_suggestions_from_indices(indices, "semantic", scores)
256
+ # Return best match as primary output
257
+ best_idx = indices[0]
258
+ d, dialect_bengali, actual, benglish = phrases_data[best_idx]
259
  return dialect_bengali, actual, benglish, suggestions
260
 
261
+ # 4) Partial matches in dialect strings
262
  partial_matches = []
263
  for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
264
  if q_lower in dialect.lower() or dialect.lower() in q_lower:
265
  similarity = difflib.SequenceMatcher(None, q_lower, dialect.lower()).ratio()
266
  partial_matches.append((i, similarity))
267
+
268
  if partial_matches:
269
  partial_matches.sort(key=lambda x: x[1], reverse=True)
 
 
 
 
270
  indices = [idx for idx, score in partial_matches[:top_k]]
271
  scores = [score for idx, score in partial_matches[:top_k]]
272
+ best_idx = indices[0]
273
+ d, dialect_bengali, actual, benglish = phrases_data[best_idx]
274
  suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
275
  return dialect_bengali, actual, benglish, suggestions
276
 
277
+ # 5) Close textual matches using difflib
278
  close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
279
  if close_matches:
 
 
 
 
 
 
 
 
280
  indices = [dialects_lower.index(m) for m in close_matches]
281
+ text_sim_scores = [difflib.SequenceMatcher(None, q_lower, m).ratio() for m in close_matches]
282
+ best_idx = indices[0]
283
+ d, dialect_bengali, actual, benglish = phrases_data[best_idx]
284
  suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
285
  return dialect_bengali, actual, benglish, suggestions
286
 
287
+ # 6) Nothing found — give sample suggestions
288
+ sample_phrases = [p[0] for p in phrases_data[:10]]
289
  return "", "", "", "❓ NO MATCH FOUND\n\nTry these sample phrases:\n" + "\n".join([f"• {ph}" for ph in sample_phrases])
290
 
291
  except Exception as ex:
 
298
  return ""
299
  patterns, meanings = semantic_analysis(user_text)
300
  if patterns:
301
+ return f"Detected patterns: {', '.join([f'{p} {m}' for p, m, t in patterns])}"
302
  return "No specific patterns detected"
303
 
304
  # Custom CSS for a softer, less blinding color scheme
 
323
  with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Soft()) as demo:
324
  gr.Markdown("# 🌍 Dialect Bengali → Actual Bengali → Benglish")
325
  gr.Markdown("Type a phrase in your dialect. The app uses both text and semantic matching to find similar phrases.")
326
+
327
  # Define input component first
328
  inp = gr.Textbox(label="Type phrase in Dialect Bengali", placeholder="e.g. Kita kobor? Sob bala asoin ni")
329
+
330
  with gr.Row():
331
  with gr.Column(scale=1):
332
  gr.Markdown("### Examples to try:")
 
337
  )
338
  with gr.Column(scale=2):
339
  btn = gr.Button("Translate / Find", variant="primary")
340
+
341
  with gr.Row():
342
  out_dialect = gr.Textbox(label="Dialect Bengali (Bengali Script)")
343
  out_actual = gr.Textbox(label="Actual Bengali (Standard)")
344
  out_benglish = gr.Textbox(label="Benglish (Phonetic English)")
345
+
346
  with gr.Row():
347
  semantic_info = gr.Textbox(label="Semantic Analysis", lines=2)
348
+
349
  suggestions = gr.Textbox(label="Status / Suggestions / Top Candidates", lines=8)
350
 
351
  # Set up event handlers
352
  btn.click(
353
+ fn=translate_text,
354
+ inputs=[inp],
355
  outputs=[out_dialect, out_actual, out_benglish, suggestions]
356
  )
357
+
358
  inp.change(
359
  fn=show_semantic_analysis,
360
  inputs=[inp],