# Hugging Face Spaces page header captured with the file (Space status: Sleeping)
# app.py
"""
Enhanced Dialect Bengali Translator with Semantic Search.

Uses both text similarity and semantic pattern matching.
Updated to include new dialect patterns and the polite/negative 'des/dis' behavior.
"""
import difflib | |
import traceback | |
import gradio as gr | |
from collections import defaultdict | |
import re | |
# === Phrase data: [Dialect Latin, Dialect Bengali Script, Actual Bengali (Std), Benglish] ===
# Each row is a 4-item list in the column order above. Rows are addressed by
# position elsewhere in the file, so the order (including the repeated
# "Ami bazaro jaimu" row) is kept stable.
# All script columns use Bengali code points only; two rows previously mixed in
# look-alike Devanagari letters, which broke substring comparisons on them.
phrases_data = [
    # Questions / common
    ["gesle ni", "গেসলে নি", "গিয়েছিলে কি?", "giese chile ki?"],
    ["oislo ni", "ওইস্লো নি", "হয়েছে কি?", "hoyeche ki?"],
    ["oigese ni", "ওইগেসে নি", "হয়ে গেছে কি?", "hoyegese ki?"],
    ["oise", "ওইসে", "হয়েছে", "hoyeche"],
    ["bala oise", "বালা ওইসে", "ভালো হয়েছে", "bhalo hoyeche"],
    ["kub bala oise", "কুব বালা ওইসে", "অনেক ভালো হয়েছে", "onek bhalo hoyeche"],
    ["oise jen", "ওইসে জেন", "হয়েছিল যে", "hoyechilo je"],
    ["jaite ni", "জাইতে নি", "যাবে কি?", "jabe ki?"],
    ["or ni", "ওর নি", "হচ্ছে কি?", "hocche ki?"],
    ["or", "ওর", "হচ্ছে", "hocche"],
    ["bala or", "বালা ওর", "ভালো হচ্ছে", "bhalo hocche"],
    ["bala ni", "বালা নি", "ভালো কি?", "bhalo ki?"],
    ["or je", "ওর যে", "হচ্ছে যে", "hocche je"],
    ["jaibe ni", "জাইবে নি", "যাবে কি?", "jabe ki?"],
    ["jare ni", "জারে নি", "যাচ্ছো কি?", "jaccho ki?"],
    ["Kita kobor?", "কিতা খবর?", "কি খবর?", "ki khobor?"],
    ["Kita korde?", "কিতা কোর্দে?", "কি করছে?", "ki korchho?"],
    ["acha oibo-tik ase", "আচা ওইবো-তিক আসে", "ঠিক আছে", "thik ache"],
    ["tew", "তেও", "তাহলে", "tahole"],
    ["tente", "তেনতে", "তাহলে", "tahole"],
    ["to", "তো", "তাহলে", "tahole"],
    ["se hole", "সে হলে", "তাহলে", "tahole"],
    ["Sob bala asoin ni", "সব বালা আসইন নি", "সব ভালো আছে কি?", "sob bhalo ache ki?"],
    ["Sob bala ase", "সব বালা আসে", "সব ভালো আছে", "sob bhalo ache"],
    ["asoini", "আসইনি", "আছে কি?", "ache ki?"],
    ["ase", "আসে", "আছে", "ache"],
    # Future / Present / Past core verbs (ja / de / fawa / ka)
    ["jaimu", "জাইমু", "যাব", "jabo"],
    ["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
    ["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
    ["jaibo", "জাইবো", "যাবে", "jabe"],
    ["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
    ["oibo", "ওইবো", "হবে", "hobe"],
    ["oibo jen", "ওইবো জেন", "হবে যে", "hobe je"],
    ["ami jaimu", "আমি জাইমু", "আমি যাব", "ami jabo"],
    ["Ami bazaro jaimu", "আমি বাজারো জাইমু", "আমি বাজারে যাব", "ami bazar e jabo"],
    ["He rit aise", "হে রিত আসে", "সে রাতে এসেছে", "se rate esheche"],
    # Give (de) family
    ["des", "দেস", "দাও (মৃদু)", "des (give, friendly)"],
    ["des na", "দেস না", "দাও (দয়া করে, মৃদু অনুরোধ)", "des na (please give)"],
    ["dis", "দিস", "না দাও / নিষেধ", "dis (don't give)"],
    ["dis na", "দিস না", "দেও না", "dis na (don't give)"],
    ["dilaisi", "দিলাইসি", "দিয়েছি", "diyechi (I gave)"],
    ["dilaise", "দিলাইসে", "দিয়েছে", "diyeche (he gave)"],
    ["dilaisoin", "দিলাইসইন", "দিয়েছেন (সম্মানভাষা)", "diyechen (honorific)"],
    ["dise na", "দিসে না", "দেয়নি", "deni (didn't give)"],
    ["dibo", "দিবো", "দেব", "debo (will give)"],
    ["der amare", "দের আমিরে", "সে আমাকে দেয়", "se amake dey"],
    ["dibo amare", "দিবো আমিরে", "সে আমাকে দেবে", "se amake debe"],
    # Get / receive (fawa) family
    ["faisi", "ফাইসি", "পেয়েছি", "peyechi (I got)"],
    ["faisi na", "ফাইসি না", "পাইনি", "pelam na (didn't get)"],
    ["faisot ni", "ফাইসোট নি", "পেলে কি?", "pele ki?"],
    ["faislo", "ফাইসলো", "পেয়ে গেল/লাভ করল (3sg past)", "pelo (he got)"],
    ["faislam", "ফাইসলাম", "পেয়েছিলাম", "pelam (I got past)"],
    ["faisla", "ফাইসলা", "পেয়েছিল (they)", "pela (they got)"],
    ["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
    ["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
    ["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
    ["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
    ["faibo", "ফাইবো", "সে পাবে", "se pabe"],
    ["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],
    # Eat (ka) family
    ["kaimu", "কাইমু", "খাব", "khaimu (I will eat)"],
    ["kaibay", "কাইবে", "তুমি খাব (dialect)", "tumi khabe"],
    ["kaibe", "কাইবে", "তুমি খাব (friend)", "tumi khabe (friend)"],
    ["kaibo", "কাইবো", "সে খাবে", "se khabe"],
    ["kaiba", "কাইবা", "তারা খাবে", "tara khabe"],
    # Other sample sentences from user's corpus
    ["Ami faisi ekta notun jinish", "আমি ফাইসি একটা নতুন জিনিস", "আমি একটা নতুন জিনিস পেয়েছি", "ami ekta notun jinish peyechi"],
    ["Tumi taka faiso ni", "তুমি টাকা ফাইসো নি", "তুমি টাকা পেয়েছ কি?", "tumi taka peyecho ki?"],
    ["He sobsomoy amare teka dey", "হে সবসময় আমিারে তেকা দেয়", "সে সবসময় আমাকে টাকা দেয়", "se shobshomoy amake taka dey"],
    # Same row as in the core-verb section above; kept so row positions do not shift.
    ["Ami bazaro jaimu", "আমি বাজারো জাইমু", "আমি বাজারে যাব", "ami bazar e jabo"],
    ["Tara bazaro bohut jinish faisoin", "তারা বাজারো বহুত জিনিস ফাইসইন", "তারা বাজারে অনেক জিনিস পেয়েছে", "tara bazar e onek jinish peyechhe"],
    ["Tumi boi diso ni", "তুমি বই দিসো নি", "আপনি কি বই দিয়েছেন?", "apni boi diyechen?"],
    ["Tuin boi disot ni", "তুইন বই দিসট নি", "তুই বই দিয়েছ কি?", "tui boi diyechish?"],
    ["Bifodo asi", "বিফোডো আছি", "বিপদে আছি", "bipode achi"],
    ["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"],
]
# ============================================================================= | |
# NEW RULE-BASED TRANSLATION ENGINE | |
# ============================================================================= | |
# 1. DIALECT LEXICON (word-level lookup table)
# Keys are romanized dialect words; values are the romanized standard-Bengali
# words that replace them (Latin script, not Bengali script).
dialect_lexicon = {
    # people
    "fua": "chele",
    "furi": "meye",
    "tara": "tara",  # identical in dialect and standard
    "ora": "tara",
    "beta": "purush",
    "sogra": "chele",
    "bakka": "pagol",
    # question words
    "kun": "ke",  # who
    "ki": "ki",  # what
    "kano": "kothay",  # where
    "kuno": "kothay",  # where (alternative spelling)
    # verbs and particles
    "jaibo": "jabe",
    "oibo": "hobe",
    "dibo": "debe",
    "lowna": "nen",  # you take (respectful)
    "koin": "bolun",  # you say (respectful)
    "disot": "diyechile",  # you gave (question form)
    "faisot": "peyechile",  # you got (question form)
    "ase": "ache",
    "or": "hocche",
    "oise": "hoyeche",
    "bala": "bhalo",
    "kub": "onek",
    "tik": "thik",
    "acha": "thik",
    "jen": "je",
    "ni": "ki",
    "kobor": "khobor",
    "korde": "korchho",
    "gesle": "giyechile",
    "oislo": "hoyeche",
    "oigese": "hoyegese",
    # go (ja) family
    "jaimu": "jabo",
    "jaibay": "jabe",
    "jaibe": "jabe",
    "jaiba": "jabe",
    # give (de) family
    "des": "dao",
    "dis": "na dio",
    "dilaisi": "diyechi",
    "dilaise": "diyeche",
    "dilaisoin": "diyechen",
    "dise": "dey",
    # get (fawa) family
    "faisi": "peyechi",
    "faislo": "pelo",
    "faislam": "pelam",
    "faisla": "pela",
    "faimu": "pabo",
    "faibay": "pabe",
    "faibe": "pabe",
    "faibo": "pabe",
    "faiba": "pabe",
    # eat (ka) family
    "kaimu": "khabo",
    "kaibay": "khabe",
    "kaibe": "khabe",
    "kaibo": "khabe",
    "kaiba": "khabe",
}
# 2. THE TRANSLATION FUNCTION | |
def translate_with_rules(user_input, lexicon=None):
    """Translate a dialect sentence word by word using a lexicon.

    The input is lower-cased and split on whitespace. Punctuation attached to
    the edges of a token (for example the "?" in "ni?") is ignored for the
    lookup and re-attached to the translated word, so punctuated words are
    still recognised.

    Args:
        user_input: Dialect sentence typed by the user (Latin script).
        lexicon: Optional mapping of dialect word -> standard word. Defaults
            to the module-level ``dialect_lexicon``.

    Returns:
        A 4-tuple ``(dialect_sentence, standard_sentence, benglish_sentence,
        explanation)``:
          * ``dialect_sentence`` is ``user_input`` unchanged.
          * ``standard_sentence`` is the word-by-word translation.
          * ``benglish_sentence`` is the same translated sentence, because the
            lexicon values are already romanized standard Bengali.
          * ``explanation`` joins one entry per token with " | "; known words
            read ``'word' -> 'translation'`` and unknown ones ``'word' -> ?``.
    """
    if lexicon is None:
        lexicon = dialect_lexicon

    # Characters trimmed from token edges before the lexicon lookup.
    edge_punctuation = ".,;:!?\"'()"

    translated_words = []
    explanation = []

    for token in user_input.lower().split():
        core = token.strip(edge_punctuation)
        std_word = lexicon.get(core) if core else None

        if std_word is None:
            # Unknown word (possibly a proper noun) or a punctuation-only
            # token: keep it as typed and flag it as unresolved.
            translated_words.append(token)
            explanation.append(f"'{token}' -> ?")
            continue

        # Put the trimmed punctuation back around the translated word.
        lead = token[: len(token) - len(token.lstrip(edge_punctuation))]
        trail = token[len(token.rstrip(edge_punctuation)):]
        translated_words.append(f"{lead}{std_word}{trail}")
        explanation.append(f"'{core}' -> '{std_word}'")

    standard_sentence = " ".join(translated_words)
    # The user typed the dialect form, so it is echoed back as-is.
    dialect_sentence = user_input
    # Lexicon values are romanized standard Bengali, i.e. already Benglish.
    benglish_sentence = standard_sentence
    return dialect_sentence, standard_sentence, benglish_sentence, " | ".join(explanation)
# ============================================================================= | |
# END OF NEW TRANSLATION ENGINE | |
# ============================================================================= | |
# Semantic mapping of dialect patterns to meanings + types | |
semantic_patterns = { | |
# question/particles | |
r"\bni\b": {"meaning": "কি", "type": "question"}, | |
r"\bni\b$": {"meaning": "কি", "type": "question"}, | |
# verbs / roots | |
r"\bor\b": {"meaning": "হচ্ছে", "type": "verb"}, | |
r"\boise\b": {"meaning": "হয়েছে", "type": "verb"}, | |
r"\boibo\b": {"meaning": "হবে", "type": "verb"}, | |
r"\bjaimu\b": {"meaning": "যাব", "type": "verb"}, | |
r"\bjaib[aey]\b": {"meaning": "যাবে", "type": "verb"}, | |
r"\bkobor\b": {"meaning": "খবর", "type": "noun"}, | |
r"\bkorde\b": {"meaning": "করছে", "type": "verb"}, | |
r"\bacha\b": {"meaning": "ঠিক", "type": "adjective"}, | |
r"\bbala\b": {"meaning": "ভালো", "type": "adjective"}, | |
r"\bkub\b": {"meaning": "অনেক", "type": "adverb"}, | |
r"\bgesle\b": {"meaning": "গিয়েছিলে", "type": "verb"}, | |
r"\boislo\b": {"meaning": "হয়েছে", "type": "verb"}, | |
r"\boigese\b": {"meaning": "হয়েগেছে", "type": "verb"}, | |
r"\bjen\b": {"meaning": "যে", "type": "conjunction"}, | |
r"\bje\b": {"meaning": "যে", "type": "conjunction"}, | |
r"\btik\b": {"meaning": "ঠিক", "type": "adjective"}, | |
r"\base\b": {"meaning": "আছে", "type": "verb"}, | |
r"\basoin\b": {"meaning": "আছে", "type": "verb"}, | |
r"\basoini\b": {"meaning": "আছে কি", "type": "verb+question"}, | |
r"\bGoto\b": {"meaning": "গত", "type": "adjective"}, | |
r"\bkali\b": {"meaning": "কাল", "type": "noun"}, | |
r"\bkita\b": {"meaning": "কি", "type": "question"}, | |
r"\btew\b": {"meaning": "তাহলে", "type": "conjunction"}, | |
# give/get polarity (important dialect contrast) | |
r"\bdes\b": {"meaning": "দান/দাও (বন্ধু-মৃদু)", "type": "give_positive"}, | |
r"\bdes\s+na\b": {"meaning": "মৃদু অনুরোধ: দাও", "type": "give_positive"}, | |
r"\bdis\b": {"meaning": "না দাও / নিষেধ", "type": "give_negative"}, | |
r"\bdis\s+na\b": {"meaning": "না দাও (নিষেধ)", "type": "give_negative"}, | |
# fawa/get variants | |
r"\bfaisi\b": {"meaning": "পেয়েছি", "type": "verb"}, | |
r"\bfaisl[ao]m\b": {"meaning": "পেয়েছিলাম/পেয়েছি(past)", "type": "verb"}, | |
r"\bfaimu\b": {"meaning": "পাব", "type": "verb"}, | |
r"\bfaib[ae]y?\b": {"meaning": "পাবে", "type": "verb"}, | |
# future pattern markers | |
r"\bmu\b": {"meaning": "ভবিষ্যৎ: 1sg", "type": "tense_future"}, | |
r"\bbay\b": {"meaning": "ভবিষ্যৎ: 2sg (tumi)", "type": "tense_future"}, | |
r"\bbo\b": {"meaning": "ভবিষ্যৎ: 3sg", "type": "tense_future"}, | |
r"\bba\b": {"meaning": "ভবিষ্যৎ: plural/3pl", "type": "tense_future"}, | |
} | |
# Lookup tables derived once from phrases_data, reused by the matching code.
# Romanized dialect phrase of every row, in row order.
dialects = [latin for latin, _script, _standard, _benglish in phrases_data]
# Lower-cased copy, index-aligned with phrases_data.
dialects_lower = [name.lower() for name in dialects]
# Standard-Bengali column of every row, in row order.
actual_bengali_list = [standard for _latin, _script, standard, _benglish in phrases_data]
# Lower-cased dialect phrase -> full row; for a repeated phrase the later row wins.
dialect_to_all = dict(zip(dialects_lower, phrases_data))
def semantic_analysis(user_input):
    """Find which entries of ``semantic_patterns`` occur in the input.

    The input is lower-cased, then every regex key of ``semantic_patterns``
    is searched in it. A pattern that fails to compile is skipped.

    Returns:
        ``(detected_patterns, meaning_components)`` where
        ``detected_patterns`` is a list of ``(pattern, meaning, type)`` tuples
        in dictionary order and ``meaning_components`` is the list of the
        corresponding meanings (one per hit, duplicates included).
    """
    lowered = user_input.lower()
    hits = []
    for regex, details in semantic_patterns.items():
        try:
            found = re.search(regex, lowered)
        except re.error:
            # A malformed pattern must not break the whole analysis.
            continue
        if found:
            hits.append((regex, details["meaning"], details["type"]))
    meanings = [meaning for _regex, meaning, _kind in hits]
    return hits, meanings
def find_semantic_matches(user_input, threshold=0.35):
    """Score every phrase row against the input's detected meanings.

    For each row the score is a meaning boost (+0.35 per detected meaning
    found in the standard-Bengali text, +0.25 per detected meaning found in
    the lower-cased dialect phrase) plus half of the difflib similarity
    between the lower-cased input and the lower-cased dialect phrase.

    Returns:
        A list of ``(row_index, score, "semantic")`` for rows whose score is
        strictly above ``threshold``, in row order. Empty when the input
        contains no known semantic pattern.
    """
    query = user_input.lower()
    _patterns, meanings = semantic_analysis(user_input)
    if not meanings:
        return []

    scored = []
    for row_index, (latin, _script, standard, _benglish) in enumerate(phrases_data):
        latin_lower = latin.lower()
        boost = 0.0
        for meaning in meanings:
            if meaning in standard:
                boost += 0.35
            # NOTE(review): meanings are Bengali-script strings while this
            # column is Latin script, so this check looks like it can never
            # succeed — confirm whether the Bengali dialect column was meant.
            if meaning in latin_lower:
                boost += 0.25
        closeness = difflib.SequenceMatcher(None, query, latin_lower).ratio()
        total = boost + (closeness * 0.5)
        if total > threshold:
            scored.append((row_index, total, "semantic"))
    return scored
def format_suggestions_from_indices(indices, match_type="text", scores=None):
    """Render phrase rows as a bullet list for the suggestions box.

    Args:
        indices: Row positions in ``phrases_data`` to display, in order.
        match_type: Label placed before "-match" in the score annotation.
        scores: Optional sequence aligned with ``indices``; each value is
            shown as a truncated integer percentage. Entries without a
            corresponding score get no annotation.

    Returns:
        One multi-line entry per index, entries separated by a blank line.
    """

    def render(position, row_index):
        latin, script, standard, benglish = phrases_data[row_index]
        annotation = ""
        if scores is not None and position < len(scores):
            percent = int(scores[position] * 100)
            annotation = f" ({match_type}-match: {percent}%)"
        return (
            f"• {latin}{annotation}\n Dialect Bengali: {script}"
            f"\n Actual Bengali: {standard}\n Benglish: {benglish}"
        )

    return "\n\n".join(render(pos, idx) for pos, idx in enumerate(indices))
def _normalize_phrase(text):
    """Return *text* lower-cased with surrounding whitespace and sentence punctuation removed."""
    return text.lower().strip(" \t\r\n.,;:!?")


def translate_text(user_text, top_k: int = 6):
    """Translate a dialect phrase, trying progressively looser strategies.

    Strategies, in order (the first one that produces a result wins):
      1. Exact phrase match, ignoring case and surrounding punctuation, so
         "kita kobor" matches the stored "Kita kobor?".
      2. Word-by-word rule engine, used only when every word was resolved.
      3. Multi-phrase input (split on ``. , ; ! ?``): each fragment is
         looked up as an exact phrase, again ignoring surrounding punctuation.
      4. Semantic matches from ``find_semantic_matches``.
      5. Substring matches between the input and the dialect phrases.
      6. Fuzzy matches via ``difflib.get_close_matches``.
      7. The partial rule-engine result plus a few sample phrases.

    Args:
        user_text: Phrase typed by the user; ``None`` is treated as empty.
        top_k: Maximum number of candidates listed in the suggestions text.

    Returns:
        ``(dialect_out, actual_out, benglish_out, suggestions_out)`` — four
        strings for the four output boxes. Any unexpected exception is
        reported in ``suggestions_out`` (with traceback) instead of raised,
        so the UI callback never fails.
    """
    try:
        q = (user_text or "").strip()
        if not q:
            return "", "", "", "Please enter a phrase or question."
        q_lower = q.lower()
        q_norm = _normalize_phrase(q)

        # 1) Exact match (case- and edge-punctuation-insensitive)
        if q_norm:
            for dialect, dialect_bengali, actual, benglish in phrases_data:
                if q_norm == _normalize_phrase(dialect):
                    return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"

        # 2) Rule engine: accept only if no word was left unresolved
        #    (unresolved words are marked with "?" in the explanation).
        dialect_out, actual_out, benglish_out, explanation = translate_with_rules(q)
        if "?" not in explanation:
            return dialect_out, actual_out, benglish_out, f"🔧 RULE-BASED TRANSLATION:\n{explanation}"

        # 3) Several phrases separated by punctuation
        potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
        if len(potential_phrases) > 1:
            results = []
            for phrase in potential_phrases:
                # The split removed the fragment's punctuation, so the stored
                # phrase must be normalized the same way before comparing.
                phrase_norm = _normalize_phrase(phrase)
                for d, dialect_bengali, actual, benglish in phrases_data:
                    if phrase_norm == _normalize_phrase(d):
                        results.append(f"{dialect_bengali} → {actual} → {benglish}")
                        break
                else:
                    results.append(f"'{phrase}' → No match found")
            return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)

        # 4) Semantic matches, best score first
        semantic_matches = find_semantic_matches(q)
        if semantic_matches:
            semantic_matches.sort(key=lambda x: x[1], reverse=True)
            top = semantic_matches[:top_k]
            indices = [idx for idx, _score, _kind in top]
            scores = [score for _idx, score, _kind in top]
            suggestions = "🔍 Semantic matches found:\n\n" + format_suggestions_from_indices(indices, "semantic", scores)
            # The best candidate fills the primary output boxes.
            _d, dialect_bengali, actual, benglish = phrases_data[indices[0]]
            return dialect_bengali, actual, benglish, suggestions

        # 5) Partial (substring) matches in either direction
        partial_matches = []
        for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
            dialect_lower = dialect.lower()
            if q_lower in dialect_lower or dialect_lower in q_lower:
                similarity = difflib.SequenceMatcher(None, q_lower, dialect_lower).ratio()
                partial_matches.append((i, similarity))
        if partial_matches:
            partial_matches.sort(key=lambda x: x[1], reverse=True)
            top = partial_matches[:top_k]
            indices = [idx for idx, _score in top]
            scores = [score for _idx, score in top]
            _d, dialect_bengali, actual, benglish = phrases_data[indices[0]]
            suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
            return dialect_bengali, actual, benglish, suggestions

        # 6) Close textual matches using difflib
        close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
        if close_matches:
            indices = [dialects_lower.index(m) for m in close_matches]
            text_sim_scores = [difflib.SequenceMatcher(None, q_lower, m).ratio() for m in close_matches]
            _d, dialect_bengali, actual, benglish = phrases_data[indices[0]]
            suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
            return dialect_bengali, actual, benglish, suggestions

        # 7) Rule engine fallback (even with some unknown words)
        samples = "\n".join(f"• {p[0]}" for p in phrases_data[:5])
        return (
            dialect_out,
            actual_out,
            benglish_out,
            f"🤖 RULE ENGINE ATTEMPT (some unknown words):\n{explanation}\n\n💡 Try these sample phrases:\n" + samples,
        )
    except Exception as ex:
        # UI boundary: surface the failure in the status box rather than crash the callback.
        tb = traceback.format_exc()
        return "", "", "", f"Runtime error:\n{str(ex)}\n\nTraceback:\n{tb}"
def show_semantic_analysis(user_text):
    """Describe which semantic patterns were detected in the input.

    Args:
        user_text: Current textbox content; ``None`` is treated as empty,
            matching how ``translate_text`` handles missing input.

    Returns:
        "" for empty or whitespace-only input, a "Detected patterns: ..." line
        listing ``pattern → meaning`` pairs when any pattern matched, and
        "No specific patterns detected" otherwise.
    """
    if not (user_text or "").strip():
        return ""
    patterns, _meanings = semantic_analysis(user_text)
    if not patterns:
        return "No specific patterns detected"
    described = ", ".join(f"{p} → {m}" for p, m, _kind in patterns)
    return f"Detected patterns: {described}"
# Custom CSS for a softer, less blinding color scheme
css = """
body {
font-family: Arial, sans-serif;
}
.gr-box {
border: 1px solid #e0e0e0;
border-radius: 8px;
}
.gr-button {
background: #4CAF50;
color: white;
}
.gr-button:hover {
background: #45a049;
}
"""

# Build the Gradio UI with the Soft theme.
# NOTE(review): the nesting of the rows/columns below was reconstructed from a
# copy of the file that had lost its indentation — confirm against the
# intended layout.
with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌍 Dialect Bengali → Actual Bengali → Benglish")
    gr.Markdown("Type a phrase in your dialect. The app uses both text and semantic matching to find similar phrases.")

    # The input box is created first because the examples widget refers to it.
    inp = gr.Textbox(
        label="Type phrase in Dialect Bengali",
        placeholder="e.g. Kita kobor? Sob bala asoin ni",
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Examples to try:")
            examples = gr.Examples(
                examples=[
                    "tew",
                    "bala or",
                    "Kita kobor?",
                    "tente",
                    "to",
                    "se hole",
                    "Sob bala asoin ni",
                    "kano tara",
                ],
                inputs=inp,
                label="Try these examples",
            )
        with gr.Column(scale=2):
            btn = gr.Button("Translate / Find", variant="primary")

    # Three translation outputs side by side.
    with gr.Row():
        out_dialect = gr.Textbox(label="Dialect Bengali (Bengali Script)")
        out_actual = gr.Textbox(label="Actual Bengali (Standard)")
        out_benglish = gr.Textbox(label="Benglish (Phonetic English)")

    # Pattern analysis next to the status/suggestions text.
    with gr.Row():
        semantic_info = gr.Textbox(label="Semantic Analysis", lines=2)
        suggestions = gr.Textbox(label="Status / Suggestions / Top Candidates", lines=8)

    # Event wiring: the button runs the full translation; every change to the
    # input box refreshes the semantic-analysis box.
    btn.click(
        fn=translate_text,
        inputs=[inp],
        outputs=[out_dialect, out_actual, out_benglish, suggestions],
    )
    inp.change(
        fn=show_semantic_analysis,
        inputs=[inp],
        outputs=[semantic_info],
    )

# Launch the app only when run as a script
if __name__ == "__main__":
    demo.launch()