anisgtboi committed on
Commit
0afbfd4
·
verified ·
1 Parent(s): 5b2f305

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -10
app.py CHANGED
@@ -44,7 +44,7 @@ phrases_data = [
44
  # Future / Present / Past core verbs (ja / de / fawa / ka)
45
  ["jaimu", "জাইমু", "যাব", "jabo"],
46
  ["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
47
- ["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
48
  ["jaibo", "জাইবো", "যাবে", "jabe"],
49
  ["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
50
  ["oibo", "ওইবো", "হবে", "hobe"],
@@ -77,7 +77,7 @@ phrases_data = [
77
  ["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
78
  ["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
79
  ["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
80
- ["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
81
  ["faibo", "ফাইবো", "সে পাবে", "se pabe"],
82
  ["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],
83
 
@@ -100,6 +100,109 @@ phrases_data = [
100
  ["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"]
101
  ]
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  # Semantic mapping of dialect patterns to meanings + types
104
  semantic_patterns = {
105
  # question/particles
@@ -230,7 +333,15 @@ def translate_text(user_text, top_k: int = 6):
230
  if q_lower == dialect.lower():
231
  return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"
232
 
233
- # 2) If input contains multiple phrases separated by punctuation
 
 
 
 
 
 
 
 
234
  potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
235
  if len(potential_phrases) > 1:
236
  results = []
@@ -245,7 +356,7 @@ def translate_text(user_text, top_k: int = 6):
245
  results.append(f"'{phrase}' → No match found")
246
  return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)
247
 
248
- # 3) Semantic matches
249
  semantic_matches = find_semantic_matches(q)
250
  if semantic_matches:
251
  # sort and return top semantic candidates
@@ -258,7 +369,7 @@ def translate_text(user_text, top_k: int = 6):
258
  d, dialect_bengali, actual, benglish = phrases_data[best_idx]
259
  return dialect_bengali, actual, benglish, suggestions
260
 
261
- # 4) Partial matches in dialect strings
262
  partial_matches = []
263
  for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
264
  if q_lower in dialect.lower() or dialect.lower() in q_lower:
@@ -274,7 +385,7 @@ def translate_text(user_text, top_k: int = 6):
274
  suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
275
  return dialect_bengali, actual, benglish, suggestions
276
 
277
- # 5) Close textual matches using difflib
278
  close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
279
  if close_matches:
280
  indices = [dialects_lower.index(m) for m in close_matches]
@@ -284,9 +395,8 @@ def translate_text(user_text, top_k: int = 6):
284
  suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
285
  return dialect_bengali, actual, benglish, suggestions
286
 
287
- # 6) Nothing found give sample suggestions
288
- sample_phrases = [p[0] for p in phrases_data[:10]]
289
- return "", "", "", "❓ NO MATCH FOUND\n\nTry these sample phrases:\n" + "\n".join([f"• {ph}" for ph in sample_phrases])
290
 
291
  except Exception as ex:
292
  tb = traceback.format_exc()
@@ -331,7 +441,7 @@ with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Sof
331
  with gr.Column(scale=1):
332
  gr.Markdown("### Examples to try:")
333
  examples = gr.Examples(
334
- examples=["tew", "bala or", "Kita kobor?", "tente", "to", "se hole", "Sob bala asoin ni"],
335
  inputs=inp,
336
  label="Try these examples"
337
  )
 
44
  # Future / Present / Past core verbs (ja / de / fawa / ka)
45
  ["jaimu", "জাইমু", "যাব", "jabo"],
46
  ["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
47
+ ["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
48
  ["jaibo", "জাইবো", "যাবে", "jabe"],
49
  ["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
50
  ["oibo", "ওইবো", "হবে", "hobe"],
 
77
  ["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
78
  ["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
79
  ["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
80
+ ["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
81
  ["faibo", "ফাইবো", "সে পাবে", "se pabe"],
82
  ["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],
83
 
 
100
  ["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"]
101
  ]
102
 
103
+ # =============================================================================
104
+ # NEW RULE-BASED TRANSLATION ENGINE
105
+ # =============================================================================
106
+
107
+ # 1. DIALECT LEXICON (A dictionary for unique words)
108
+ # This maps a dialect word to its standard Bengali meaning.
109
+ dialect_lexicon = {
110
+ 'fua': 'chele',
111
+ 'furi': 'meye',
112
+ 'tara': 'tara', # It's the same!
113
+ 'ora': 'tara',
114
+ 'beta': 'purush',
115
+ 'sogra': 'chele',
116
+ 'bakka': 'pagol',
117
+ 'kun': 'ke', # who
118
+ 'ki': 'ki', # what
119
+ 'kano': 'kothay', # where - THIS IS THE KEY FOR YOUR PHRASE
120
+ 'kuno': 'kothay', # where (alternative spelling)
121
+ 'jaibo': 'jabe',
122
+ 'oibo': 'hobe',
123
+ 'dibo': 'debe',
124
+ 'lowna': 'nen', # you take (respectful)
125
+ 'koin': 'bolun', # you say (respectful)
126
+ 'disot': 'diyechile', # you gave (question form)
127
+ 'faisot': 'peyechile', # you got (question form)
128
+ 'ase': 'ache',
129
+ 'or': 'hocche',
130
+ 'oise': 'hoyeche',
131
+ 'bala': 'bhalo',
132
+ 'kub': 'onek',
133
+ 'tik': 'thik',
134
+ 'acha': 'thik',
135
+ 'jen': 'je',
136
+ 'ni': 'ki',
137
+ 'kobor': 'khobor',
138
+ 'korde': 'korchho',
139
+ 'gesle': 'giyechile',
140
+ 'oislo': 'hoyeche',
141
+ 'oigese': 'hoyegese',
142
+ 'jaimu': 'jabo',
143
+ 'jaibay': 'jabe',
144
+ 'jaibe': 'jabe',
145
+ 'jaiba': 'jabe',
146
+ 'des': 'dao',
147
+ 'dis': 'na dio',
148
+ 'dilaisi': 'diyechi',
149
+ 'dilaise': 'diyeche',
150
+ 'dilaisoin': 'diyechen',
151
+ 'dise': 'dey',
152
+ 'faisi': 'peyechi',
153
+ 'faislo': 'pelo',
154
+ 'faislam': 'pelam',
155
+ 'faisla': 'pela',
156
+ 'faimu': 'pabo',
157
+ 'faibay': 'pabe',
158
+ 'faibe': 'pabe',
159
+ 'faibo': 'pabe',
160
+ 'faiba': 'pabe',
161
+ 'kaimu': 'khabo',
162
+ 'kaibay': 'khabe',
163
+ 'kaibe': 'khabe',
164
+ 'kaibo': 'khabe',
165
+ 'kaiba': 'khabe',
166
+ }
167
+
168
+ # 2. THE TRANSLATION FUNCTION
169
+ def translate_with_rules(user_input):
170
+ """
171
+ This is the new core function. It translates a sentence
172
+ using the rule engine and lexicon.
173
+ It returns: (dialect_bengali, actual_bengali, benglish, explanation)
174
+ """
175
+ # Step 1: Tokenize - split the sentence into words
176
+ input_words = user_input.lower().split()
177
+ translated_std_words = [] # This will hold the standard Bengali words
178
+ explanation = [] # This will explain the translation
179
+
180
+ # Step 2: Decode each word using the lexicon
181
+ for word in input_words:
182
+ # Check if the word is in the dialect lexicon
183
+ if word in dialect_lexicon:
184
+ std_word = dialect_lexicon[word]
185
+ translated_std_words.append(std_word)
186
+ explanation.append(f"'{word}' -> '{std_word}'")
187
+ else:
188
+ # If not found, keep the original word (it might be proper noun)
189
+ translated_std_words.append(word)
190
+ explanation.append(f"'{word}' -> ?")
191
+
192
+ # Step 3: Reconstruct the standard Bengali sentence
193
+ standard_sentence = " ".join(translated_std_words)
194
+
195
+ # For now, we'll use the input as dialect form since user typed it
196
+ dialect_sentence = user_input
197
+ # Benglish could be a simple phonetic version
198
+ benglish_sentence = user_input
199
+
200
+ return dialect_sentence, standard_sentence, benglish_sentence, " | ".join(explanation)
201
+
202
+ # =============================================================================
203
+ # END OF NEW TRANSLATION ENGINE
204
+ # =============================================================================
205
+
206
  # Semantic mapping of dialect patterns to meanings + types
207
  semantic_patterns = {
208
  # question/particles
 
333
  if q_lower == dialect.lower():
334
  return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"
335
 
336
+ # 2) NEW: Try to translate it using the RULE ENGINE
337
+ # Check if this is a simple phrase that can be broken down
338
+ dialect_out, actual_out, benglish_out, explanation = translate_with_rules(q)
339
+
340
+ # If the rule engine found translations for all words, use it!
341
+ if "?" not in explanation: # Basic check - if no unknown words
342
+ return dialect_out, actual_out, benglish_out, f"🔧 RULE-BASED TRANSLATION:\n{explanation}"
343
+
344
+ # 3) If input contains multiple phrases separated by punctuation
345
  potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
346
  if len(potential_phrases) > 1:
347
  results = []
 
356
  results.append(f"'{phrase}' → No match found")
357
  return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)
358
 
359
+ # 4) Semantic matches
360
  semantic_matches = find_semantic_matches(q)
361
  if semantic_matches:
362
  # sort and return top semantic candidates
 
369
  d, dialect_bengali, actual, benglish = phrases_data[best_idx]
370
  return dialect_bengali, actual, benglish, suggestions
371
 
372
+ # 5) Partial matches in dialect strings
373
  partial_matches = []
374
  for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
375
  if q_lower in dialect.lower() or dialect.lower() in q_lower:
 
385
  suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
386
  return dialect_bengali, actual, benglish, suggestions
387
 
388
+ # 6) Close textual matches using difflib
389
  close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
390
  if close_matches:
391
  indices = [dialects_lower.index(m) for m in close_matches]
 
395
  suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
396
  return dialect_bengali, actual, benglish, suggestions
397
 
398
+ # 7) Rule engine fallback (even with some unknown words)
399
+ return dialect_out, actual_out, benglish_out, f"🤖 RULE ENGINE ATTEMPT (some unknown words):\n{explanation}\n\n💡 Try these sample phrases:\n" + "\n".join([f"• {p[0]}" for p in phrases_data[:5]])
 
400
 
401
  except Exception as ex:
402
  tb = traceback.format_exc()
 
441
  with gr.Column(scale=1):
442
  gr.Markdown("### Examples to try:")
443
  examples = gr.Examples(
444
+ examples=["tew", "bala or", "Kita kobor?", "tente", "to", "se hole", "Sob bala asoin ni", "kano tara"],
445
  inputs=inp,
446
  label="Try these examples"
447
  )