Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -44,7 +44,7 @@ phrases_data = [
|
|
44 |
# Future / Present / Past core verbs (ja / de / fawa / ka)
|
45 |
["jaimu", "জাইমু", "যাব", "jabo"],
|
46 |
["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
|
47 |
-
["jaibe", "জাইবে", "
|
48 |
["jaibo", "জাইবো", "যাবে", "jabe"],
|
49 |
["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
|
50 |
["oibo", "ওইবো", "হবে", "hobe"],
|
@@ -77,7 +77,7 @@ phrases_data = [
|
|
77 |
["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
|
78 |
["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
|
79 |
["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
|
80 |
-
["faibe", "
|
81 |
["faibo", "ফাইবো", "সে পাবে", "se pabe"],
|
82 |
["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],
|
83 |
|
@@ -100,6 +100,109 @@ phrases_data = [
|
|
100 |
["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"]
|
101 |
]
|
102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
# Semantic mapping of dialect patterns to meanings + types
|
104 |
semantic_patterns = {
|
105 |
# question/particles
|
@@ -230,7 +333,15 @@ def translate_text(user_text, top_k: int = 6):
|
|
230 |
if q_lower == dialect.lower():
|
231 |
return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"
|
232 |
|
233 |
-
# 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
|
235 |
if len(potential_phrases) > 1:
|
236 |
results = []
|
@@ -245,7 +356,7 @@ def translate_text(user_text, top_k: int = 6):
|
|
245 |
results.append(f"'{phrase}' → No match found")
|
246 |
return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)
|
247 |
|
248 |
-
#
|
249 |
semantic_matches = find_semantic_matches(q)
|
250 |
if semantic_matches:
|
251 |
# sort and return top semantic candidates
|
@@ -258,7 +369,7 @@ def translate_text(user_text, top_k: int = 6):
|
|
258 |
d, dialect_bengali, actual, benglish = phrases_data[best_idx]
|
259 |
return dialect_bengali, actual, benglish, suggestions
|
260 |
|
261 |
-
#
|
262 |
partial_matches = []
|
263 |
for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
|
264 |
if q_lower in dialect.lower() or dialect.lower() in q_lower:
|
@@ -274,7 +385,7 @@ def translate_text(user_text, top_k: int = 6):
|
|
274 |
suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
|
275 |
return dialect_bengali, actual, benglish, suggestions
|
276 |
|
277 |
-
#
|
278 |
close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
|
279 |
if close_matches:
|
280 |
indices = [dialects_lower.index(m) for m in close_matches]
|
@@ -284,9 +395,8 @@ def translate_text(user_text, top_k: int = 6):
|
|
284 |
suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
|
285 |
return dialect_bengali, actual, benglish, suggestions
|
286 |
|
287 |
-
#
|
288 |
-
|
289 |
-
return "", "", "", "❓ NO MATCH FOUND\n\nTry these sample phrases:\n" + "\n".join([f"• {ph}" for ph in sample_phrases])
|
290 |
|
291 |
except Exception as ex:
|
292 |
tb = traceback.format_exc()
|
@@ -331,7 +441,7 @@ with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Sof
|
|
331 |
with gr.Column(scale=1):
|
332 |
gr.Markdown("### Examples to try:")
|
333 |
examples = gr.Examples(
|
334 |
-
examples=["tew", "bala or", "Kita kobor?", "tente", "to", "se hole", "Sob bala asoin ni"],
|
335 |
inputs=inp,
|
336 |
label="Try these examples"
|
337 |
)
|
|
|
44 |
# Future / Present / Past core verbs (ja / de / fawa / ka)
|
45 |
["jaimu", "জাইমু", "যাব", "jabo"],
|
46 |
["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
|
47 |
+
["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
|
48 |
["jaibo", "জাইবো", "যাবে", "jabe"],
|
49 |
["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
|
50 |
["oibo", "ওইবো", "হবে", "hobe"],
|
|
|
77 |
["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
|
78 |
["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
|
79 |
["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
|
80 |
+
["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
|
81 |
["faibo", "ফাইবো", "সে পাবে", "se pabe"],
|
82 |
["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],
|
83 |
|
|
|
100 |
["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"]
|
101 |
]
|
102 |
|
103 |
+
# =============================================================================
|
104 |
+
# NEW RULE-BASED TRANSLATION ENGINE
|
105 |
+
# =============================================================================
|
106 |
+
|
107 |
+
# 1. DIALECT LEXICON (A dictionary for unique words)
|
108 |
+
# This maps a dialect word to its standard Bengali meaning.
|
109 |
+
dialect_lexicon = {
    # Lookup table used by the rule engine: romanized dialect word ->
    # romanized standard word. Keys are lower-case single words.
    #
    # --- people / nouns ---
    "fua": "chele",
    "furi": "meye",
    "tara": "tara",  # identical in both forms
    "ora": "tara",
    "beta": "purush",
    "sogra": "chele",
    "bakka": "pagol",
    # --- question words ---
    "kun": "ke",  # who
    "ki": "ki",  # what
    "kano": "kothay",  # where
    "kuno": "kothay",  # where (alternative spelling)
    # --- '-ibo' forms ---
    "jaibo": "jabe",
    "oibo": "hobe",
    "dibo": "debe",
    # --- respectful forms ---
    "lowna": "nen",  # you take (respectful)
    "koin": "bolun",  # you say (respectful)
    # --- question forms ---
    "disot": "diyechile",  # you gave (question form)
    "faisot": "peyechile",  # you got (question form)
    # --- miscellaneous words ---
    "ase": "ache",
    "or": "hocche",
    "oise": "hoyeche",
    "bala": "bhalo",
    "kub": "onek",
    "tik": "thik",
    "acha": "thik",
    "jen": "je",
    "ni": "ki",
    "kobor": "khobor",
    "korde": "korchho",
    "gesle": "giyechile",
    "oislo": "hoyeche",
    "oigese": "hoyegese",
    # --- 'jai-' forms ---
    "jaimu": "jabo",
    "jaibay": "jabe",
    "jaibe": "jabe",
    "jaiba": "jabe",
    # --- 'de-' / 'di-' forms ---
    "des": "dao",
    "dis": "na dio",
    "dilaisi": "diyechi",
    "dilaise": "diyeche",
    "dilaisoin": "diyechen",
    "dise": "dey",
    # --- 'fai-' forms ---
    "faisi": "peyechi",
    "faislo": "pelo",
    "faislam": "pelam",
    "faisla": "pela",
    "faimu": "pabo",
    "faibay": "pabe",
    "faibe": "pabe",
    "faibo": "pabe",
    "faiba": "pabe",
    # --- 'kai-' forms ---
    "kaimu": "khabo",
    "kaibay": "khabe",
    "kaibe": "khabe",
    "kaibo": "khabe",
    "kaiba": "khabe",
}
|
167 |
+
|
168 |
+
# 2. THE TRANSLATION FUNCTION
|
169 |
+
def translate_with_rules(user_input):
    """Translate a dialect sentence word by word using ``dialect_lexicon``.

    The input is lower-cased and split on whitespace. Punctuation attached
    to the edges of a token (e.g. the ``?`` in ``"kobor?"``) is set aside
    before the lexicon lookup and re-attached to the translated word, so
    punctuated input is still translated.

    Args:
        user_input: The sentence typed by the user (a ``str``).

    Returns:
        A 4-tuple ``(dialect_sentence, standard_sentence, benglish_sentence,
        explanation)``:

        * ``dialect_sentence`` and ``benglish_sentence`` are ``user_input``
          returned unchanged.
        * ``standard_sentence`` is the lower-cased tokens joined by single
          spaces, with every token found in the lexicon replaced.
        * ``explanation`` is a ``" | "``-joined list with one entry per
          word: ``'word' -> 'translation'`` for known words and
          ``'word' -> ?`` for unknown ones.
    """
    # Characters that may cling to either edge of a whitespace-separated
    # token (includes the Bengali danda).
    edge_punctuation = ".,;:!?\"'()[]{}।"

    translated_std_words = []  # words of the reconstructed standard sentence
    explanation = []  # one "'word' -> ..." entry per translated/unknown word

    for token in user_input.lower().split():
        core = token.strip(edge_punctuation)
        if not core:
            # Token is punctuation only: pass it through, nothing to explain.
            translated_std_words.append(token)
            continue

        # Split the token into leading punctuation, the word, trailing
        # punctuation, so the punctuation can be re-attached afterwards.
        lead = len(token) - len(token.lstrip(edge_punctuation))
        prefix = token[:lead]
        suffix = token[lead + len(core):]

        std_word = dialect_lexicon.get(core)
        if std_word is None:
            # Unknown word (possibly a proper noun): keep it as typed.
            # The explanation uses the bare word so that punctuation typed
            # by the user (notably "?") is never mistaken for the
            # unknown-word marker.
            translated_std_words.append(token)
            explanation.append(f"'{core}' -> ?")
        else:
            translated_std_words.append(f"{prefix}{std_word}{suffix}")
            explanation.append(f"'{core}' -> '{std_word}'")

    standard_sentence = " ".join(translated_std_words)

    # The user typed the dialect form, so it is echoed back unchanged for
    # both the dialect and the Benglish (phonetic) outputs.
    dialect_sentence = user_input
    benglish_sentence = user_input

    return dialect_sentence, standard_sentence, benglish_sentence, " | ".join(explanation)
|
201 |
+
|
202 |
+
# =============================================================================
|
203 |
+
# END OF NEW TRANSLATION ENGINE
|
204 |
+
# =============================================================================
|
205 |
+
|
206 |
# Semantic mapping of dialect patterns to meanings + types
|
207 |
semantic_patterns = {
|
208 |
# question/particles
|
|
|
333 |
if q_lower == dialect.lower():
|
334 |
return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"
|
335 |
|
336 |
+
# 2) NEW: Try to translate it using the RULE ENGINE
|
337 |
+
# Check if this is a simple phrase that can be broken down
|
338 |
+
dialect_out, actual_out, benglish_out, explanation = translate_with_rules(q)
|
339 |
+
|
340 |
+
# If the rule engine found translations for all words, use it!
|
341 |
+
if "?" not in explanation: # Basic check - if no unknown words
|
342 |
+
return dialect_out, actual_out, benglish_out, f"🔧 RULE-BASED TRANSLATION:\n{explanation}"
|
343 |
+
|
344 |
+
# 3) If input contains multiple phrases separated by punctuation
|
345 |
potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
|
346 |
if len(potential_phrases) > 1:
|
347 |
results = []
|
|
|
356 |
results.append(f"'{phrase}' → No match found")
|
357 |
return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)
|
358 |
|
359 |
+
# 4) Semantic matches
|
360 |
semantic_matches = find_semantic_matches(q)
|
361 |
if semantic_matches:
|
362 |
# sort and return top semantic candidates
|
|
|
369 |
d, dialect_bengali, actual, benglish = phrases_data[best_idx]
|
370 |
return dialect_bengali, actual, benglish, suggestions
|
371 |
|
372 |
+
# 5) Partial matches in dialect strings
|
373 |
partial_matches = []
|
374 |
for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
|
375 |
if q_lower in dialect.lower() or dialect.lower() in q_lower:
|
|
|
385 |
suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
|
386 |
return dialect_bengali, actual, benglish, suggestions
|
387 |
|
388 |
+
# 6) Close textual matches using difflib
|
389 |
close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
|
390 |
if close_matches:
|
391 |
indices = [dialects_lower.index(m) for m in close_matches]
|
|
|
395 |
suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
|
396 |
return dialect_bengali, actual, benglish, suggestions
|
397 |
|
398 |
+
# 7) Rule engine fallback (even with some unknown words)
|
399 |
+
return dialect_out, actual_out, benglish_out, f"🤖 RULE ENGINE ATTEMPT (some unknown words):\n{explanation}\n\n💡 Try these sample phrases:\n" + "\n".join([f"• {p[0]}" for p in phrases_data[:5]])
|
|
|
400 |
|
401 |
except Exception as ex:
|
402 |
tb = traceback.format_exc()
|
|
|
441 |
with gr.Column(scale=1):
|
442 |
gr.Markdown("### Examples to try:")
|
443 |
examples = gr.Examples(
|
444 |
+
examples=["tew", "bala or", "Kita kobor?", "tente", "to", "se hole", "Sob bala asoin ni", "kano tara"],
|
445 |
inputs=inp,
|
446 |
label="Try these examples"
|
447 |
)
|